An Exception was encountered at 'In [7]'.
This notebook is intended as a generic notebook to be used with the papermill python library to allow automated generation of analyses and reports for classifiers on microbiome data generated by the kraken2 pipeline
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
from sklearn import model_selection
from sklearn import metrics
import os
import re
import copy
import numpy as np
import pandas as pd
import sys
sys.path.insert(0, '/project/6011811/data/microbiome_OJS/workflow/src/')
from MicroBiome import MicroBiomeDataSet, Trainer, TrainTester, MultiTrainTester, list_transformer, DiffExpTransform
from ScoreFunctions import *
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model as LM
import seaborn as sns
import pickle as pk
from matplotlib import pyplot as plt
# Ignore warning messages
if True:
import warnings
warnings.filterwarnings('ignore')
# Notebook defaults; the papermill-injected cell below overrides them.
input_dir = '/project/data/preprocessed/PE_50K_sex_complete'
output_dir = '/project/results/LR_Classifier_DE_generic'
retrain = True  # False: load previously pickled results instead of retraining
fdr_DE = 0.05  # FDR cutoff for the differential-abundance feature filter
# Parameters
input_dir = "results/kraken2_PE_500/notebooks/PE_500_Sex/prepped_data"
output_dir = "results/kraken2_PE_500/notebooks/PE_500_Sex/LR_DE"
retrain = True
fdr_DE = 0.05
# Sanity check: confirm the expected prepped-data files are present.
os.listdir(input_dir)
['meta_data_mat.pk', 'metadata_samples_keep.csv', 'y.pk', 'feat_meta.csv', 'X.pk']
# Load the pickled feature matrix (X) and labels (y) produced by the
# data-prep notebook. Context managers guarantee the files are closed
# even if unpickling raises (the original open/close pairs leaked the
# handle on error).
with open(os.path.join(input_dir, 'X.pk'), 'rb') as infile_X:
    X = pk.load(infile_X)
with open(os.path.join(input_dir, 'y.pk'), 'rb') as infile_y:
    y = pk.load(infile_y)
# Optionally also load sample metadata and append it to the features:
# with open(os.path.join(input_dir, 'meta_data_mat.pk'), 'rb') as f:
#     meta_data_mat = pk.load(f)
# X_inp = np.concatenate([X, meta_data_mat], axis=1)
# model input: microbiome features only
X_inp = X
Execution using papermill encountered an exception here and stopped:
import shutil  # local import: used only for the results-directory reset below

n_splits = 10  # number of independent train/test splits
out_path = os.path.join(output_dir, 'MyMultiTrainTester.pk')
if retrain:
    # Clear previous results, if any. shutil.rmtree replaces the original
    # os.system('rm -rf ' + output_dir), which was unsafe for paths with
    # spaces/shell metacharacters and ignored failures.
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    # makedirs also creates missing parents (os.mkdir fails when the
    # nested results tree does not exist yet).
    os.makedirs(output_dir)
    # We run PCA on the microbiome data's differentially abundant features.
    MyLogistic = LM.LogisticRegression(random_state=42, class_weight='balanced',
                                       penalty='elasticnet', solver='saga', l1_ratio=0.5)
    MatrixPipeLine = Pipeline([('DE0', DiffExpTransform(fdr=fdr_DE)),
                               ('scaler0', StandardScaler()),
                               ('pca', PCA())])
    clf = Pipeline([('TransformedData', MatrixPipeLine), ('LR', MyLogistic)])
    # NOTE(review): PCA with n_components=3 fails whenever the DE filter
    # keeps fewer than 3 features (observed in the low-depth runs:
    # "n_components=3 must be between 0 and min(n_samples, n_features)=1").
    # Consider bounding n_components by the minimum expected feature count.
    param_grid = dict(LR__C=np.exp(-np.arange(-10, 10)),
                      TransformedData__pca__n_components=np.arange(3, 4))
    model = model_selection.GridSearchCV(clf, param_grid,
                                         scoring=metrics.make_scorer(metrics.f1_score),
                                         cv=5)
    # Trainer
    MyTrainer = Trainer(model=model)
    # random seed used in class definition is not used in final output models
    MyTrainTester = TrainTester(MyTrainer, metrics.f1_score)
    # The seed here only drives the sequence of seeds handed to the
    # per-split TrainTester copies, i.e. only the sample splits differ
    # across the n_splits runs.
    MyMultiTrainTester = MultiTrainTester(MyTrainTester, numpy_rand_seed=42, n_splits=n_splits)
    MyMultiTrainTester.train(X_inp, y)
    # save results
    with open(out_path, 'wb') as outfile:
        pk.dump(MyMultiTrainTester, outfile)
else:
    # load previous results
    with open(out_path, 'rb') as infile:
        MyMultiTrainTester = pk.load(infile)
Running for split 1 of 10
--------------------------------------------------------------------------- NotFittedError Traceback (most recent call last) /tmp/ipykernel_11904/3138475753.py in <module> 23 # across all data splits 24 MyMultiTrainTester = MultiTrainTester(MyTrainTester, numpy_rand_seed=42, n_splits=n_splits) ---> 25 MyMultiTrainTester.train(X_inp, y) 26 # save results 27 outfile = open(out_path,'wb') ~/src/MicroBiome.py in train(self, X, y, **args) 628 TrainTesterCopy = copy.deepcopy(self.template) 629 TrainTesterCopy.rand_state = seed_sequence[i] --> 630 TrainTesterCopy.train(X, y, **args) 631 self.train_scores.append(TrainTesterCopy.train_score) 632 self.test_scores.append(TrainTesterCopy.test_score) ~/src/MicroBiome.py in train(self, X, y, do_validation, **args) 535 self.history = history 536 else: --> 537 self.Trainer.fit(X_train, y_train, **args) 538 539 ~/src/MicroBiome.py in fit(self, X, y, epochs, class_weight, batch_size, validation_data) 428 429 if validation_data is None: --> 430 self.model.fit(X, y) 431 else: 432 self.model.fit(X, y, epochs=epochs, class_weight=class_weight, batch_size=batch_size, validation_data=validation_data) /src/anaconda/envs/microbiome/lib/python3.7/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs) 61 extra_args = len(args) - len(all_args) 62 if extra_args <= 0: ---> 63 return f(*args, **kwargs) 64 65 # extra_args > 0 /src/anaconda/envs/microbiome/lib/python3.7/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params) 839 return results 840 --> 841 self._run_search(evaluate_candidates) 842 843 # multimetric is determined here because in the case of a callable /src/anaconda/envs/microbiome/lib/python3.7/site-packages/sklearn/model_selection/_search.py in _run_search(self, evaluate_candidates) 1294 def _run_search(self, evaluate_candidates): 1295 """Search all candidates in param_grid""" -> 1296 evaluate_candidates(ParameterGrid(self.param_grid)) 1297 1298 
/src/anaconda/envs/microbiome/lib/python3.7/site-packages/sklearn/model_selection/_search.py in evaluate_candidates(candidate_params, cv, more_results) 825 # of out will be done in `_insert_error_scores`. 826 if callable(self.scoring): --> 827 _insert_error_scores(out, self.error_score) 828 all_candidate_params.extend(candidate_params) 829 all_out.extend(out) /src/anaconda/envs/microbiome/lib/python3.7/site-packages/sklearn/model_selection/_validation.py in _insert_error_scores(results, error_score) 299 300 if successful_score is None: --> 301 raise NotFittedError("All estimators failed to fit") 302 303 if isinstance(successful_score, dict): NotFittedError: All estimators failed to fit
# Per-split train/test F1 scores in long format. pd.concat replaces
# DataFrame.append, which is deprecated and removed in pandas >= 2.0.
scores_df = pd.concat([
    pd.DataFrame({'score': MyMultiTrainTester.train_scores,
                  'stage': np.repeat('train', n_splits)}),
    pd.DataFrame({'score': MyMultiTrainTester.test_scores,
                  'stage': np.repeat('test', n_splits)}),
])
scores_df
sns.boxplot(data=scores_df, x='stage', y='score')
# Peek at the DE-selected feature matrix from split i's best model.
i = 1
MyMultiTrainTester.TrainerList[i].model.best_estimator_['TransformedData'].named_steps['DE0'].transform(X_inp)
# Summarise the grid-search choices per split: LR regularisation C, the
# number of features kept by the DE filter, and PCA dimensionality.
split_records = []
for split_idx in range(n_splits):
    split_model = MyMultiTrainTester.TrainerList[split_idx].model
    de_step = split_model.best_estimator_['TransformedData'].named_steps['DE0']
    split_records.append({
        'C': split_model.best_params_['LR__C'],
        'n_feats': np.sum(copy.deepcopy(de_step.selected_feats)),
        'n_components': split_model.best_params_['TransformedData__pca__n_components'],
    })
hyperparams_df = pd.DataFrame(split_records)
hyperparams_df
# Per-split scoring metrics in long format: one row per (split, score_type).
scoring_metrics = MyMultiTrainTester.getScores()
scoring_metrics_df = pd.DataFrame(scoring_metrics)
scoring_metrics_df.to_csv(os.path.join(output_dir, 'scoring_metrics.csv'))
sns.set(rc={"figure.figsize": (12, 8)})
sns.boxplot(data=scoring_metrics_df, x='score_type', y='value')
# Aggregate each metric across splits (np.std = population std, ddof=0).
score_names = np.unique(scoring_metrics['score_type'])
score_stats = {'score': [], 'mean': [], 'median': [], 'std_dev': []}
for score in score_names:
    score_vect = scoring_metrics_df['value'].to_numpy()[scoring_metrics_df['score_type'] == score]
    score_stats['score'].append(score)
    score_stats['mean'].append(np.mean(score_vect))
    score_stats['median'].append(np.median(score_vect))
    score_stats['std_dev'].append(np.std(score_vect))
score_stats_df = pd.DataFrame(score_stats)
score_stats_df
# Write next to the other outputs; the original wrote to the current
# working directory, so the file landed outside output_dir.
score_stats_df.to_csv(os.path.join(output_dir, 'score_stats.csv'))
# Diagnostic plots across all splits.
MyMultiTrainTester.plot_confusion(normalize=True, figsize=(15, 25))
MyMultiTrainTester.plot_confusion(normalize=False, figsize=(15, 25))
MyMultiTrainTester.plot_class_freq(normalize=True, figsize=(15, 35))
MyMultiTrainTester.plot_precrecall(figsize=(15, 35))
An Exception was encountered at 'In [7]'.
This notebook is intended as a generic notebook to be used with the papermill python library to allow automated generation of analyses and reports for classifiers on microbiome data generated by the kraken2 pipeline
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
from sklearn import model_selection
from sklearn import metrics
import os
import re
import copy
import numpy as np
import pandas as pd
import sys
sys.path.insert(0, '/project/6011811/data/microbiome_OJS/workflow/src/')
from MicroBiome import MicroBiomeDataSet, Trainer, TrainTester, MultiTrainTester, list_transformer, DiffExpTransform
from ScoreFunctions import *
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model as LM
import seaborn as sns
import pickle as pk
from matplotlib import pyplot as plt
# Ignore warning messages
if True:
import warnings
warnings.filterwarnings('ignore')
# Notebook defaults; the papermill-injected cell below overrides them.
input_dir = '/project/data/preprocessed/PE_50K_sex_complete'
output_dir = '/project/results/LR_Classifier_DE_generic'
retrain = True  # False: load previously pickled results instead of retraining
fdr_DE = 0.05  # FDR cutoff for the differential-abundance feature filter
# Parameters
input_dir = "results/kraken2_PE_1K/notebooks/PE_1K_Sex/prepped_data"
output_dir = "results/kraken2_PE_1K/notebooks/PE_1K_Sex/LR_DE"
retrain = True
fdr_DE = 0.05
# Sanity check: confirm the expected prepped-data files are present.
os.listdir(input_dir)
['meta_data_mat.pk', 'metadata_samples_keep.csv', 'y.pk', 'feat_meta.csv', 'X.pk']
# Load the pickled feature matrix (X) and labels (y) produced by the
# data-prep notebook. Context managers guarantee the files are closed
# even if unpickling raises (the original open/close pairs leaked the
# handle on error).
with open(os.path.join(input_dir, 'X.pk'), 'rb') as infile_X:
    X = pk.load(infile_X)
with open(os.path.join(input_dir, 'y.pk'), 'rb') as infile_y:
    y = pk.load(infile_y)
# Optionally also load sample metadata and append it to the features:
# with open(os.path.join(input_dir, 'meta_data_mat.pk'), 'rb') as f:
#     meta_data_mat = pk.load(f)
# X_inp = np.concatenate([X, meta_data_mat], axis=1)
# model input: microbiome features only
X_inp = X
Execution using papermill encountered an exception here and stopped:
import shutil  # local import: used only for the results-directory reset below

n_splits = 10  # number of independent train/test splits
out_path = os.path.join(output_dir, 'MyMultiTrainTester.pk')
if retrain:
    # Clear previous results, if any. shutil.rmtree replaces the original
    # os.system('rm -rf ' + output_dir), which was unsafe for paths with
    # spaces/shell metacharacters and ignored failures.
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    # makedirs also creates missing parents (os.mkdir fails when the
    # nested results tree does not exist yet).
    os.makedirs(output_dir)
    # We run PCA on the microbiome data's differentially abundant features.
    MyLogistic = LM.LogisticRegression(random_state=42, class_weight='balanced',
                                       penalty='elasticnet', solver='saga', l1_ratio=0.5)
    MatrixPipeLine = Pipeline([('DE0', DiffExpTransform(fdr=fdr_DE)),
                               ('scaler0', StandardScaler()),
                               ('pca', PCA())])
    clf = Pipeline([('TransformedData', MatrixPipeLine), ('LR', MyLogistic)])
    # NOTE(review): PCA with n_components=3 fails whenever the DE filter
    # keeps fewer than 3 features (observed in the low-depth runs:
    # "n_components=3 must be between 0 and min(n_samples, n_features)=1").
    # Consider bounding n_components by the minimum expected feature count.
    param_grid = dict(LR__C=np.exp(-np.arange(-10, 10)),
                      TransformedData__pca__n_components=np.arange(3, 4))
    model = model_selection.GridSearchCV(clf, param_grid,
                                         scoring=metrics.make_scorer(metrics.f1_score),
                                         cv=5)
    # Trainer
    MyTrainer = Trainer(model=model)
    # random seed used in class definition is not used in final output models
    MyTrainTester = TrainTester(MyTrainer, metrics.f1_score)
    # The seed here only drives the sequence of seeds handed to the
    # per-split TrainTester copies, i.e. only the sample splits differ
    # across the n_splits runs.
    MyMultiTrainTester = MultiTrainTester(MyTrainTester, numpy_rand_seed=42, n_splits=n_splits)
    MyMultiTrainTester.train(X_inp, y)
    # save results
    with open(out_path, 'wb') as outfile:
        pk.dump(MyMultiTrainTester, outfile)
else:
    # load previous results
    with open(out_path, 'rb') as infile:
        MyMultiTrainTester = pk.load(infile)
Running for split 1 of 10
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) /tmp/ipykernel_8952/3138475753.py in <module> 23 # across all data splits 24 MyMultiTrainTester = MultiTrainTester(MyTrainTester, numpy_rand_seed=42, n_splits=n_splits) ---> 25 MyMultiTrainTester.train(X_inp, y) 26 # save results 27 outfile = open(out_path,'wb') ~/src/MicroBiome.py in train(self, X, y, **args) 628 TrainTesterCopy = copy.deepcopy(self.template) 629 TrainTesterCopy.rand_state = seed_sequence[i] --> 630 TrainTesterCopy.train(X, y, **args) 631 self.train_scores.append(TrainTesterCopy.train_score) 632 self.test_scores.append(TrainTesterCopy.test_score) ~/src/MicroBiome.py in train(self, X, y, do_validation, **args) 535 self.history = history 536 else: --> 537 self.Trainer.fit(X_train, y_train, **args) 538 539 ~/src/MicroBiome.py in fit(self, X, y, epochs, class_weight, batch_size, validation_data) 428 429 if validation_data is None: --> 430 self.model.fit(X, y) 431 else: 432 self.model.fit(X, y, epochs=epochs, class_weight=class_weight, batch_size=batch_size, validation_data=validation_data) /src/anaconda/envs/microbiome/lib/python3.7/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs) 61 extra_args = len(args) - len(all_args) 62 if extra_args <= 0: ---> 63 return f(*args, **kwargs) 64 65 # extra_args > 0 /src/anaconda/envs/microbiome/lib/python3.7/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params) 878 refit_start_time = time.time() 879 if y is not None: --> 880 self.best_estimator_.fit(X, y, **fit_params) 881 else: 882 self.best_estimator_.fit(X, **fit_params) /src/anaconda/envs/microbiome/lib/python3.7/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params) 339 """ 340 fit_params_steps = self._check_fit_params(**fit_params) --> 341 Xt = self._fit(X, y, **fit_params_steps) 342 with _print_elapsed_time('Pipeline', 343 self._log_message(len(self.steps) - 1)): 
/src/anaconda/envs/microbiome/lib/python3.7/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params_steps) 305 message_clsname='Pipeline', 306 message=self._log_message(step_idx), --> 307 **fit_params_steps[name]) 308 # Replace the transformer of the step with the fitted 309 # transformer. This is necessary when loading the transformer /src/anaconda/envs/microbiome/lib/python3.7/site-packages/joblib/memory.py in __call__(self, *args, **kwargs) 350 351 def __call__(self, *args, **kwargs): --> 352 return self.func(*args, **kwargs) 353 354 def call_and_shelve(self, *args, **kwargs): /src/anaconda/envs/microbiome/lib/python3.7/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params) 752 with _print_elapsed_time(message_clsname, message): 753 if hasattr(transformer, 'fit_transform'): --> 754 res = transformer.fit_transform(X, y, **fit_params) 755 else: 756 res = transformer.fit(X, y, **fit_params).transform(X) /src/anaconda/envs/microbiome/lib/python3.7/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params) 385 fit_params_last_step = fit_params_steps[self.steps[-1][0]] 386 if hasattr(last_step, 'fit_transform'): --> 387 return last_step.fit_transform(Xt, y, **fit_params_last_step) 388 else: 389 return last_step.fit(Xt, y, /src/anaconda/envs/microbiome/lib/python3.7/site-packages/sklearn/decomposition/_pca.py in fit_transform(self, X, y) 381 C-ordered array, use 'np.ascontiguousarray'. 
382 """ --> 383 U, S, Vt = self._fit(X) 384 U = U[:, :self.n_components_] 385 /src/anaconda/envs/microbiome/lib/python3.7/site-packages/sklearn/decomposition/_pca.py in _fit(self, X) 428 # Call different fits for either full or truncated SVD 429 if self._fit_svd_solver == 'full': --> 430 return self._fit_full(X, n_components) 431 elif self._fit_svd_solver in ['arpack', 'randomized']: 432 return self._fit_truncated(X, n_components, self._fit_svd_solver) /src/anaconda/envs/microbiome/lib/python3.7/site-packages/sklearn/decomposition/_pca.py in _fit_full(self, X, n_components) 447 "min(n_samples, n_features)=%r with " 448 "svd_solver='full'" --> 449 % (n_components, min(n_samples, n_features))) 450 elif n_components >= 1: 451 if not isinstance(n_components, numbers.Integral): ValueError: n_components=3 must be between 0 and min(n_samples, n_features)=1 with svd_solver='full'
# Per-split train/test F1 scores in long format. pd.concat replaces
# DataFrame.append, which is deprecated and removed in pandas >= 2.0.
scores_df = pd.concat([
    pd.DataFrame({'score': MyMultiTrainTester.train_scores,
                  'stage': np.repeat('train', n_splits)}),
    pd.DataFrame({'score': MyMultiTrainTester.test_scores,
                  'stage': np.repeat('test', n_splits)}),
])
scores_df
sns.boxplot(data=scores_df, x='stage', y='score')
# Peek at the DE-selected feature matrix from split i's best model.
i = 1
MyMultiTrainTester.TrainerList[i].model.best_estimator_['TransformedData'].named_steps['DE0'].transform(X_inp)
# Summarise the grid-search choices per split: LR regularisation C, the
# number of features kept by the DE filter, and PCA dimensionality.
split_records = []
for split_idx in range(n_splits):
    split_model = MyMultiTrainTester.TrainerList[split_idx].model
    de_step = split_model.best_estimator_['TransformedData'].named_steps['DE0']
    split_records.append({
        'C': split_model.best_params_['LR__C'],
        'n_feats': np.sum(copy.deepcopy(de_step.selected_feats)),
        'n_components': split_model.best_params_['TransformedData__pca__n_components'],
    })
hyperparams_df = pd.DataFrame(split_records)
hyperparams_df
# Per-split scoring metrics in long format: one row per (split, score_type).
scoring_metrics = MyMultiTrainTester.getScores()
scoring_metrics_df = pd.DataFrame(scoring_metrics)
scoring_metrics_df.to_csv(os.path.join(output_dir, 'scoring_metrics.csv'))
sns.set(rc={"figure.figsize": (12, 8)})
sns.boxplot(data=scoring_metrics_df, x='score_type', y='value')
# Aggregate each metric across splits (np.std = population std, ddof=0).
score_names = np.unique(scoring_metrics['score_type'])
score_stats = {'score': [], 'mean': [], 'median': [], 'std_dev': []}
for score in score_names:
    score_vect = scoring_metrics_df['value'].to_numpy()[scoring_metrics_df['score_type'] == score]
    score_stats['score'].append(score)
    score_stats['mean'].append(np.mean(score_vect))
    score_stats['median'].append(np.median(score_vect))
    score_stats['std_dev'].append(np.std(score_vect))
score_stats_df = pd.DataFrame(score_stats)
score_stats_df
# Write next to the other outputs; the original wrote to the current
# working directory, so the file landed outside output_dir.
score_stats_df.to_csv(os.path.join(output_dir, 'score_stats.csv'))
# Diagnostic plots across all splits.
MyMultiTrainTester.plot_confusion(normalize=True, figsize=(15, 25))
MyMultiTrainTester.plot_confusion(normalize=False, figsize=(15, 25))
MyMultiTrainTester.plot_class_freq(normalize=True, figsize=(15, 35))
MyMultiTrainTester.plot_precrecall(figsize=(15, 35))
This notebook is intended as a generic notebook to be used with the papermill python library to allow automated generation of analyses and reports for classifiers on microbiome data generated by the kraken2 pipeline
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
from sklearn import model_selection
from sklearn import metrics
import os
import re
import copy
import numpy as np
import pandas as pd
import sys
sys.path.insert(0, '/project/6011811/data/microbiome_OJS/workflow/src/')
from MicroBiome import MicroBiomeDataSet, Trainer, TrainTester, MultiTrainTester, list_transformer, DiffExpTransform
from ScoreFunctions import *
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model as LM
import seaborn as sns
import pickle as pk
from matplotlib import pyplot as plt
# Ignore warning messages
if True:
import warnings
warnings.filterwarnings('ignore')
# Notebook defaults; the papermill-injected cell below overrides them.
input_dir = '/project/data/preprocessed/PE_50K_sex_complete'
output_dir = '/project/results/LR_Classifier_DE_generic'
retrain = True  # False: load previously pickled results instead of retraining
fdr_DE = 0.05  # FDR cutoff for the differential-abundance feature filter
# Parameters
input_dir = "results/kraken2_PE_5K/notebooks/PE_5K_Sex/prepped_data"
output_dir = "results/kraken2_PE_5K/notebooks/PE_5K_Sex/LR_DE"
retrain = True
fdr_DE = 0.05
# Sanity check: confirm the expected prepped-data files are present.
os.listdir(input_dir)
['meta_data_mat.pk', 'metadata_samples_keep.csv', 'y.pk', 'feat_meta.csv', 'X.pk']
# Load the pickled feature matrix (X) and labels (y) produced by the
# data-prep notebook. Context managers guarantee the files are closed
# even if unpickling raises (the original open/close pairs leaked the
# handle on error).
with open(os.path.join(input_dir, 'X.pk'), 'rb') as infile_X:
    X = pk.load(infile_X)
with open(os.path.join(input_dir, 'y.pk'), 'rb') as infile_y:
    y = pk.load(infile_y)
# Optionally also load sample metadata and append it to the features:
# with open(os.path.join(input_dir, 'meta_data_mat.pk'), 'rb') as f:
#     meta_data_mat = pk.load(f)
# X_inp = np.concatenate([X, meta_data_mat], axis=1)
# model input: microbiome features only
X_inp = X
import shutil  # local import: used only for the results-directory reset below

n_splits = 10  # number of independent train/test splits
out_path = os.path.join(output_dir, 'MyMultiTrainTester.pk')
if retrain:
    # Clear previous results, if any. shutil.rmtree replaces the original
    # os.system('rm -rf ' + output_dir), which was unsafe for paths with
    # spaces/shell metacharacters and ignored failures.
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    # makedirs also creates missing parents (os.mkdir fails when the
    # nested results tree does not exist yet).
    os.makedirs(output_dir)
    # We run PCA on the microbiome data's differentially abundant features.
    MyLogistic = LM.LogisticRegression(random_state=42, class_weight='balanced',
                                       penalty='elasticnet', solver='saga', l1_ratio=0.5)
    MatrixPipeLine = Pipeline([('DE0', DiffExpTransform(fdr=fdr_DE)),
                               ('scaler0', StandardScaler()),
                               ('pca', PCA())])
    clf = Pipeline([('TransformedData', MatrixPipeLine), ('LR', MyLogistic)])
    # NOTE(review): PCA with n_components=3 fails whenever the DE filter
    # keeps fewer than 3 features (observed in the low-depth runs:
    # "n_components=3 must be between 0 and min(n_samples, n_features)=1").
    # Consider bounding n_components by the minimum expected feature count.
    param_grid = dict(LR__C=np.exp(-np.arange(-10, 10)),
                      TransformedData__pca__n_components=np.arange(3, 4))
    model = model_selection.GridSearchCV(clf, param_grid,
                                         scoring=metrics.make_scorer(metrics.f1_score),
                                         cv=5)
    # Trainer
    MyTrainer = Trainer(model=model)
    # random seed used in class definition is not used in final output models
    MyTrainTester = TrainTester(MyTrainer, metrics.f1_score)
    # The seed here only drives the sequence of seeds handed to the
    # per-split TrainTester copies, i.e. only the sample splits differ
    # across the n_splits runs.
    MyMultiTrainTester = MultiTrainTester(MyTrainTester, numpy_rand_seed=42, n_splits=n_splits)
    MyMultiTrainTester.train(X_inp, y)
    # save results
    with open(out_path, 'wb') as outfile:
        pk.dump(MyMultiTrainTester, outfile)
else:
    # load previous results
    with open(out_path, 'rb') as infile:
        MyMultiTrainTester = pk.load(infile)
Running for split 1 of 10 Using predict_proba getting predictions from probs Running for split 2 of 10 Using predict_proba getting predictions from probs Running for split 3 of 10 Using predict_proba getting predictions from probs Running for split 4 of 10 Using predict_proba getting predictions from probs Running for split 5 of 10 Using predict_proba getting predictions from probs Running for split 6 of 10 Using predict_proba getting predictions from probs Running for split 7 of 10 Using predict_proba getting predictions from probs Running for split 8 of 10 Using predict_proba getting predictions from probs Running for split 9 of 10 Using predict_proba getting predictions from probs Running for split 10 of 10 Using predict_proba getting predictions from probs
# Per-split train/test F1 scores in long format. pd.concat replaces
# DataFrame.append, which is deprecated and removed in pandas >= 2.0.
scores_df = pd.concat([
    pd.DataFrame({'score': MyMultiTrainTester.train_scores,
                  'stage': np.repeat('train', n_splits)}),
    pd.DataFrame({'score': MyMultiTrainTester.test_scores,
                  'stage': np.repeat('test', n_splits)}),
])
scores_df
| score | stage | |
|---|---|---|
| 0 | 0.668852 | train |
| 1 | 0.743902 | train |
| 2 | 0.719033 | train |
| 3 | 0.664577 | train |
| 4 | 0.692810 | train |
| 5 | 0.692810 | train |
| 6 | 0.726115 | train |
| 7 | 0.652482 | train |
| 8 | 0.681818 | train |
| 9 | 0.733119 | train |
| 0 | 0.738095 | test |
| 1 | 0.729412 | test |
| 2 | 0.705882 | test |
| 3 | 0.658537 | test |
| 4 | 0.651163 | test |
| 5 | 0.651163 | test |
| 6 | 0.725000 | test |
| 7 | 0.764045 | test |
| 8 | 0.722892 | test |
| 9 | 0.641975 | test |
sns.boxplot(data = scores_df, x = 'stage', y = 'score')
<AxesSubplot:xlabel='stage', ylabel='score'>
# Peek at the DE-selected feature matrix from split i's best model.
i = 1
MyMultiTrainTester.TrainerList[i].model.best_estimator_['TransformedData'].named_steps['DE0'].transform(X_inp)
array([[0. , 0. , 0. , ..., 0.13, 0. , 2.13],
[0. , 0.15, 0.07, ..., 0.02, 0. , 0.49],
[0. , 0.02, 0. , ..., 0.04, 0. , 0.07],
...,
[0.02, 0. , 0. , ..., 0.33, 0.02, 0.22],
[0. , 0. , 0. , ..., 0.14, 0. , 0.17],
[0. , 0. , 0. , ..., 0.34, 0. , 0.25]], dtype=float32)
# Summarise the grid-search choices per split: LR regularisation C, the
# number of features kept by the DE filter, and PCA dimensionality.
split_records = []
for split_idx in range(n_splits):
    split_model = MyMultiTrainTester.TrainerList[split_idx].model
    de_step = split_model.best_estimator_['TransformedData'].named_steps['DE0']
    split_records.append({
        'C': split_model.best_params_['LR__C'],
        'n_feats': np.sum(copy.deepcopy(de_step.selected_feats)),
        'n_components': split_model.best_params_['TransformedData__pca__n_components'],
    })
hyperparams_df = pd.DataFrame(split_records)
hyperparams_df
| C | n_feats | n_components | |
|---|---|---|---|
| 0 | 22026.465795 | 11 | 3 |
| 1 | 22026.465795 | 12 | 3 |
| 2 | 22026.465795 | 11 | 3 |
| 3 | 22026.465795 | 12 | 3 |
| 4 | 22026.465795 | 9 | 3 |
| 5 | 22026.465795 | 9 | 3 |
| 6 | 22026.465795 | 20 | 3 |
| 7 | 22026.465795 | 4 | 3 |
| 8 | 22026.465795 | 13 | 3 |
| 9 | 22026.465795 | 18 | 3 |
# Per-split scoring metrics in long format: one row per (split, score_type).
scoring_metrics = MyMultiTrainTester.getScores()
scoring_metrics_df = pd.DataFrame(scoring_metrics)
scoring_metrics_df.to_csv(os.path.join(output_dir, 'scoring_metrics.csv'))
sns.set(rc={"figure.figsize":(12, 8)})
sns.boxplot(data = scoring_metrics_df, x = 'score_type', y = 'value')
<AxesSubplot:xlabel='score_type', ylabel='value'>
# Aggregate each metric across splits: mean / median / population std.
score_names = np.unique(scoring_metrics['score_type'])
stat_rows = {'score': [], 'mean': [], 'median': [], 'std_dev': []}
all_values = scoring_metrics_df['value'].to_numpy()
type_col = scoring_metrics_df['score_type']
for name in score_names:
    vect = all_values[type_col == name]
    stat_rows['score'].append(name)
    stat_rows['mean'].append(np.mean(vect))
    stat_rows['median'].append(np.median(vect))
    stat_rows['std_dev'].append(np.std(vect))
score_stats_df = pd.DataFrame(stat_rows)
score_stats_df
| score | mean | median | std_dev | |
|---|---|---|---|---|
| 0 | AUPRC_NEG | 0.660579 | 0.641417 | 0.042306 |
| 1 | AUPRC_POS | 0.778491 | 0.790404 | 0.047639 |
| 2 | AUROC_NEG | 0.731389 | 0.736382 | 0.032114 |
| 3 | AUROC_POS | 0.731389 | 0.736382 | 0.032114 |
| 4 | f1_score | 0.698816 | 0.714387 | 0.041766 |
| 5 | npv_score | 0.603176 | 0.611455 | 0.080635 |
| 6 | ppv_score | 0.740441 | 0.725330 | 0.042753 |
| 7 | sensitivity | 0.663785 | 0.666667 | 0.056054 |
| 8 | specificity | 0.686719 | 0.676548 | 0.059668 |
# Write next to the other outputs; the original wrote to the current
# working directory, so the file landed outside output_dir.
score_stats_df.to_csv(os.path.join(output_dir, 'score_stats.csv'))
# Diagnostic plots across all splits.
MyMultiTrainTester.plot_confusion(normalize=True, figsize=(15, 25))
MyMultiTrainTester.plot_confusion(normalize=False, figsize=(15, 25))
MyMultiTrainTester.plot_class_freq(normalize=True, figsize=(15, 35))
MyMultiTrainTester.plot_precrecall(figsize=(15, 35))
This notebook is intended as a generic notebook to be used with the papermill python library to allow automated generation of analyses and reports for classifiers on microbiome data generated by the kraken2 pipeline
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
from sklearn import model_selection
from sklearn import metrics
import os
import re
import copy
import numpy as np
import pandas as pd
import sys
sys.path.insert(0, '/project/6011811/data/microbiome_OJS/workflow/src/')
from MicroBiome import MicroBiomeDataSet, Trainer, TrainTester, MultiTrainTester, list_transformer, DiffExpTransform
from ScoreFunctions import *
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model as LM
import seaborn as sns
import pickle as pk
from matplotlib import pyplot as plt
# Ignore warning messages
if True:
import warnings
warnings.filterwarnings('ignore')
# Notebook defaults; the papermill-injected cell below overrides them.
input_dir = '/project/data/preprocessed/PE_50K_sex_complete'
output_dir = '/project/results/LR_Classifier_DE_generic'
retrain = True  # False: load previously pickled results instead of retraining
fdr_DE = 0.05  # FDR cutoff for the differential-abundance feature filter
# Parameters
input_dir = "results/kraken2_PE_10K/notebooks/PE_10K_Sex/prepped_data"
output_dir = "results/kraken2_PE_10K/notebooks/PE_10K_Sex/LR_DE"
retrain = True
fdr_DE = 0.05
# Sanity check: confirm the expected prepped-data files are present.
os.listdir(input_dir)
['meta_data_mat.pk', 'metadata_samples_keep.csv', 'y.pk', 'feat_meta.csv', 'X.pk']
# Load the pickled feature matrix (X) and labels (y) produced by the
# data-prep notebook. Context managers guarantee the files are closed
# even if unpickling raises (the original open/close pairs leaked the
# handle on error).
with open(os.path.join(input_dir, 'X.pk'), 'rb') as infile_X:
    X = pk.load(infile_X)
with open(os.path.join(input_dir, 'y.pk'), 'rb') as infile_y:
    y = pk.load(infile_y)
# Optionally also load sample metadata and append it to the features:
# with open(os.path.join(input_dir, 'meta_data_mat.pk'), 'rb') as f:
#     meta_data_mat = pk.load(f)
# X_inp = np.concatenate([X, meta_data_mat], axis=1)
# model input: microbiome features only
X_inp = X
import shutil  # local import: used only for the results-directory reset below

n_splits = 10  # number of independent train/test splits
out_path = os.path.join(output_dir, 'MyMultiTrainTester.pk')
if retrain:
    # Clear previous results, if any. shutil.rmtree replaces the original
    # os.system('rm -rf ' + output_dir), which was unsafe for paths with
    # spaces/shell metacharacters and ignored failures.
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    # makedirs also creates missing parents (os.mkdir fails when the
    # nested results tree does not exist yet).
    os.makedirs(output_dir)
    # We run PCA on the microbiome data's differentially abundant features.
    MyLogistic = LM.LogisticRegression(random_state=42, class_weight='balanced',
                                       penalty='elasticnet', solver='saga', l1_ratio=0.5)
    MatrixPipeLine = Pipeline([('DE0', DiffExpTransform(fdr=fdr_DE)),
                               ('scaler0', StandardScaler()),
                               ('pca', PCA())])
    clf = Pipeline([('TransformedData', MatrixPipeLine), ('LR', MyLogistic)])
    # NOTE(review): PCA with n_components=3 fails whenever the DE filter
    # keeps fewer than 3 features (observed in the low-depth runs:
    # "n_components=3 must be between 0 and min(n_samples, n_features)=1").
    # Consider bounding n_components by the minimum expected feature count.
    param_grid = dict(LR__C=np.exp(-np.arange(-10, 10)),
                      TransformedData__pca__n_components=np.arange(3, 4))
    model = model_selection.GridSearchCV(clf, param_grid,
                                         scoring=metrics.make_scorer(metrics.f1_score),
                                         cv=5)
    # Trainer
    MyTrainer = Trainer(model=model)
    # random seed used in class definition is not used in final output models
    MyTrainTester = TrainTester(MyTrainer, metrics.f1_score)
    # The seed here only drives the sequence of seeds handed to the
    # per-split TrainTester copies, i.e. only the sample splits differ
    # across the n_splits runs.
    MyMultiTrainTester = MultiTrainTester(MyTrainTester, numpy_rand_seed=42, n_splits=n_splits)
    MyMultiTrainTester.train(X_inp, y)
    # save results
    with open(out_path, 'wb') as outfile:
        pk.dump(MyMultiTrainTester, outfile)
else:
    # load previous results
    with open(out_path, 'rb') as infile:
        MyMultiTrainTester = pk.load(infile)
Running for split 1 of 10 Using predict_proba getting predictions from probs Running for split 2 of 10 Using predict_proba getting predictions from probs Running for split 3 of 10 Using predict_proba getting predictions from probs Running for split 4 of 10 Using predict_proba getting predictions from probs Running for split 5 of 10 Using predict_proba getting predictions from probs Running for split 6 of 10 Using predict_proba getting predictions from probs Running for split 7 of 10 Using predict_proba getting predictions from probs Running for split 8 of 10 Using predict_proba getting predictions from probs Running for split 9 of 10 Using predict_proba getting predictions from probs Running for split 10 of 10 Using predict_proba getting predictions from probs
# Per-split train/test F1 scores in long format. pd.concat replaces
# DataFrame.append, which is deprecated and removed in pandas >= 2.0.
scores_df = pd.concat([
    pd.DataFrame({'score': MyMultiTrainTester.train_scores,
                  'stage': np.repeat('train', n_splits)}),
    pd.DataFrame({'score': MyMultiTrainTester.test_scores,
                  'stage': np.repeat('test', n_splits)}),
])
scores_df
| score | stage | |
|---|---|---|
| 0 | 0.732484 | train |
| 1 | 0.743590 | train |
| 2 | 0.781726 | train |
| 3 | 0.735905 | train |
| 4 | 0.735484 | train |
| 5 | 0.735484 | train |
| 6 | 0.754902 | train |
| 7 | 0.728863 | train |
| 8 | 0.738155 | train |
| 9 | 0.734177 | train |
| 0 | 0.738095 | test |
| 1 | 0.717949 | test |
| 2 | 0.719101 | test |
| 3 | 0.764045 | test |
| 4 | 0.758621 | test |
| 5 | 0.758621 | test |
| 6 | 0.660194 | test |
| 7 | 0.762887 | test |
| 8 | 0.759259 | test |
| 9 | 0.691358 | test |
sns.boxplot(data = scores_df, x = 'stage', y = 'score')
<AxesSubplot:xlabel='stage', ylabel='score'>
# Peek at the DE-selected feature matrix from split i's best model.
i = 1
MyMultiTrainTester.TrainerList[i].model.best_estimator_['TransformedData'].named_steps['DE0'].transform(X_inp)
array([[0.01, 0.3 , 0. , ..., 0. , 0.04, 0. ],
[0. , 0.02, 0.02, ..., 0. , 0. , 0.09],
[0.01, 0.47, 0. , ..., 0. , 0.01, 0.02],
...,
[0. , 0.01, 0. , ..., 0. , 0.13, 0.04],
[0. , 0.01, 0. , ..., 0. , 0.05, 0.08],
[0. , 0.06, 0. , ..., 0. , 0.19, 2.44]], dtype=float32)
# Summarise the grid-search choices per split: LR regularisation C, the
# number of features kept by the DE filter, and PCA dimensionality.
split_records = []
for split_idx in range(n_splits):
    split_model = MyMultiTrainTester.TrainerList[split_idx].model
    de_step = split_model.best_estimator_['TransformedData'].named_steps['DE0']
    split_records.append({
        'C': split_model.best_params_['LR__C'],
        'n_feats': np.sum(copy.deepcopy(de_step.selected_feats)),
        'n_components': split_model.best_params_['TransformedData__pca__n_components'],
    })
hyperparams_df = pd.DataFrame(split_records)
hyperparams_df
| C | n_feats | n_components | |
|---|---|---|---|
| 0 | 22026.465795 | 21 | 3 |
| 1 | 0.018316 | 31 | 3 |
| 2 | 0.006738 | 20 | 3 |
| 3 | 22026.465795 | 16 | 3 |
| 4 | 22026.465795 | 25 | 3 |
| 5 | 22026.465795 | 25 | 3 |
| 6 | 0.006738 | 29 | 3 |
| 7 | 22026.465795 | 6 | 3 |
| 8 | 0.006738 | 18 | 3 |
| 9 | 0.135335 | 28 | 3 |
# Per-split scoring metrics in long format: one row per (split, score_type).
scoring_metrics = MyMultiTrainTester.getScores()
scoring_metrics_df = pd.DataFrame(scoring_metrics)
scoring_metrics_df.to_csv(os.path.join(output_dir, 'scoring_metrics.csv'))
sns.set(rc={"figure.figsize":(12, 8)})
sns.boxplot(data = scoring_metrics_df, x = 'score_type', y = 'value')
<AxesSubplot:xlabel='score_type', ylabel='value'>
# Aggregate each metric across splits: mean / median / population std.
score_names = np.unique(scoring_metrics['score_type'])
stat_rows = {'score': [], 'mean': [], 'median': [], 'std_dev': []}
all_values = scoring_metrics_df['value'].to_numpy()
type_col = scoring_metrics_df['score_type']
for name in score_names:
    vect = all_values[type_col == name]
    stat_rows['score'].append(name)
    stat_rows['mean'].append(np.mean(vect))
    stat_rows['median'].append(np.median(vect))
    stat_rows['std_dev'].append(np.std(vect))
score_stats_df = pd.DataFrame(stat_rows)
score_stats_df
| score | mean | median | std_dev | |
|---|---|---|---|---|
| 0 | AUPRC_NEG | 0.681669 | 0.673158 | 0.044460 |
| 1 | AUPRC_POS | 0.792750 | 0.801981 | 0.049796 |
| 2 | AUROC_NEG | 0.746453 | 0.751949 | 0.024166 |
| 3 | AUROC_POS | 0.746453 | 0.751949 | 0.024166 |
| 4 | f1_score | 0.733013 | 0.748358 | 0.033675 |
| 5 | npv_score | 0.648883 | 0.621622 | 0.065651 |
| 6 | ppv_score | 0.723914 | 0.756364 | 0.093782 |
| 7 | sensitivity | 0.764106 | 0.728842 | 0.090397 |
| 8 | specificity | 0.588401 | 0.676945 | 0.197664 |
# Write next to the other outputs; the original wrote to the current
# working directory, so the file landed outside output_dir.
score_stats_df.to_csv(os.path.join(output_dir, 'score_stats.csv'))
# Diagnostic plots across all splits.
MyMultiTrainTester.plot_confusion(normalize=True, figsize=(15, 25))
MyMultiTrainTester.plot_confusion(normalize=False, figsize=(15, 25))
MyMultiTrainTester.plot_class_freq(normalize=True, figsize=(15, 35))
MyMultiTrainTester.plot_precrecall(figsize=(15, 35))
This notebook is intended as a generic notebook to be used with the papermill python library to allow automated generation of analyses and reports for classifiers on microbiome data generated by the kraken2 pipeline
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
# Standard library
import copy
import os
import pickle as pk
import re
import shutil
import sys

# Third-party
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn import linear_model as LM
from sklearn import metrics
from sklearn import model_selection
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler

# Project-local (sys.path update must precede these imports)
sys.path.insert(0, '/project/6011811/data/microbiome_OJS/workflow/src/')
from MicroBiome import MicroBiomeDataSet, Trainer, TrainTester, MultiTrainTester, list_transformer, DiffExpTransform
from ScoreFunctions import *
# Ignore warning messages
if True:
import warnings
warnings.filterwarnings('ignore')
# Default parameters (papermill "parameters" cell); placeholders that the
# injected cell below overrides at execution time.
input_dir = '/project/data/preprocessed/PE_50K_sex_complete'
output_dir = '/project/results/LR_Classifier_DE_generic'
retrain = True
fdr_DE = 0.05
# Parameters
# Injected by papermill; these override the defaults above for this run.
input_dir = "results/kraken2_PE_25K/notebooks/PE_25K_Sex/prepped_data"
output_dir = "results/kraken2_PE_25K/notebooks/PE_25K_Sex/LR_DE"
retrain = True
fdr_DE = 0.05
# Sanity check: list the prepped-data directory contents.
os.listdir(input_dir)
['meta_data_mat.pk', 'metadata_samples_keep.csv', 'y.pk', 'feat_meta.csv', 'X.pk']
infile_X = open(os.path.join(input_dir, 'X.pk'),'rb')
X = pk.load(infile_X)
infile_X.close()
infile_y = open(os.path.join(input_dir, 'y.pk'),'rb')
y = pk.load(infile_y)
infile_y.close()
# infile_meta_data_mat = open(os.path.join(input_dir, 'meta_data_mat.pk'), 'rb')
# meta_data_mat = pk.load(infile_meta_data_mat)
# infile_meta_data_mat.close()
# model input
# X_inp = np.concatenate([X, meta_data_mat], axis=1)
X_inp = X
n_splits = 10
out_path = os.path.join(output_dir, 'MyMultiTrainTester.pk')
if retrain:
    # Clear previous results, if any. shutil.rmtree replaces the former
    # `os.system('rm -rf ' + output_dir)`: no shell involved, and failures
    # raise instead of being silently ignored.
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)
    # Model: elastic-net logistic regression on the PCA of the
    # differentially-abundant (DE) features.
    MyLogistic = LM.LogisticRegression(random_state=42, class_weight='balanced',
                                       penalty='elasticnet', solver='saga', l1_ratio=0.5)
    MatrixPipeLine = Pipeline([('DE0', DiffExpTransform(fdr=fdr_DE)), ('scaler0', StandardScaler()), ('pca', PCA())])
    clf = Pipeline([('TransformedData', MatrixPipeLine), ('LR', MyLogistic)])
    # Grid: regularization strength C over e^-10..e^9; PCA fixed at 3 components.
    param_grid = dict(LR__C=np.exp(-np.arange(-10, 10)),
                      TransformedData__pca__n_components=np.arange(3, 4))
    model = model_selection.GridSearchCV(clf, param_grid, scoring=metrics.make_scorer(metrics.f1_score), cv=5)
    # Trainer
    MyTrainer = Trainer(model=model)
    # random seed used in class definition is not used in final output models
    MyTrainTester = TrainTester(MyTrainer, metrics.f1_score)
    # note that random seed here affects sequence of seeds passed to making new TrainTester objects
    # using LRTrainTester as template. Thus, you have all settings but seed affecting sample split
    # across all data splits
    MyMultiTrainTester = MultiTrainTester(MyTrainTester, numpy_rand_seed=42, n_splits=n_splits)
    MyMultiTrainTester.train(X_inp, y)
    # Persist the trained object so retrain=False runs can reload it.
    with open(out_path, 'wb') as outfile:
        pk.dump(MyMultiTrainTester, outfile)
else:
    # Load previous results.
    with open(out_path, 'rb') as infile:
        MyMultiTrainTester = pk.load(infile)
Running for split 1 of 10 Using predict_proba getting predictions from probs Running for split 2 of 10 Using predict_proba getting predictions from probs Running for split 3 of 10 Using predict_proba getting predictions from probs Running for split 4 of 10 Using predict_proba getting predictions from probs Running for split 5 of 10 Using predict_proba getting predictions from probs Running for split 6 of 10 Using predict_proba getting predictions from probs Running for split 7 of 10 Using predict_proba getting predictions from probs Running for split 8 of 10 Using predict_proba getting predictions from probs Running for split 9 of 10 Using predict_proba getting predictions from probs Running for split 10 of 10 Using predict_proba getting predictions from probs
# Per-split F1 on train and held-out test folds, in long form.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat (default ignore_index=False, matching append) is the
# supported replacement.
scores_df = pd.concat([
    pd.DataFrame({'score': MyMultiTrainTester.train_scores, 'stage': np.repeat('train', n_splits)}),
    pd.DataFrame({'score': MyMultiTrainTester.test_scores, 'stage': np.repeat('test', n_splits)}),
])
scores_df
| score | stage | |
|---|---|---|
| 0 | 0.730000 | train |
| 1 | 0.729730 | train |
| 2 | 0.752941 | train |
| 3 | 0.696864 | train |
| 4 | 0.726667 | train |
| 5 | 0.726667 | train |
| 6 | 0.726688 | train |
| 7 | 0.712329 | train |
| 8 | 0.748538 | train |
| 9 | 0.751634 | train |
| 0 | 0.734694 | test |
| 1 | 0.702703 | test |
| 2 | 0.693333 | test |
| 3 | 0.829268 | test |
| 4 | 0.666667 | test |
| 5 | 0.666667 | test |
| 6 | 0.695652 | test |
| 7 | 0.750000 | test |
| 8 | 0.772727 | test |
| 9 | 0.674699 | test |
# Compare train vs. held-out test F1 distributions across the splits.
sns.boxplot(data = scores_df, x = 'stage', y = 'score')
<AxesSubplot:xlabel='stage', ylabel='score'>
# Spot-check split i's fitted DE filter: project X_inp onto the
# differentially-abundant features selected for that split.
i = 1
MyMultiTrainTester.TrainerList[i].model.best_estimator_['TransformedData'].named_steps['DE0'].transform(X_inp)
array([[0.01, 0. , 0.26, ..., 0.05, 0.13, 0. ],
[0.01, 0.01, 0.04, ..., 0.03, 0.04, 0.1 ],
[0.02, 0.01, 0.41, ..., 0.05, 0.3 , 0.02],
...,
[0.04, 0. , 0. , ..., 0.15, 3.95, 0.03],
[0.06, 0. , 0. , ..., 0.07, 6.66, 0.1 ],
[0.24, 0. , 0.05, ..., 0.14, 3.02, 2.26]], dtype=float32)
# hyperparams = {'l1_ratio': [], 'C': []}
# feats_in_split = []
# Best hyper-parameters per split: LR regularization C, number of
# DE-selected features, and retained PCA dimensionality.
hyperparams = {'C': [], 'n_feats': [], 'n_components': []}
for i in range(n_splits):
    fitted = MyMultiTrainTester.TrainerList[i].model
    hyperparams['C'].append(fitted.best_params_['LR__C'])
    # Only the count of selected features is used, so the former
    # copy.deepcopy of the selection mask was unnecessary work.
    selected_feats_i = fitted.best_estimator_['TransformedData'].named_steps['DE0'].selected_feats
    hyperparams['n_feats'].append(np.sum(selected_feats_i))
    hyperparams['n_components'].append(fitted.best_params_['TransformedData__pca__n_components'])
hyperparams_df = pd.DataFrame(hyperparams)
hyperparams_df
| C | n_feats | n_components | |
|---|---|---|---|
| 0 | 0.006738 | 34 | 3 |
| 1 | 2.718282 | 59 | 3 |
| 2 | 0.006738 | 33 | 3 |
| 3 | 22026.465795 | 24 | 3 |
| 4 | 0.135335 | 41 | 3 |
| 5 | 0.135335 | 41 | 3 |
| 6 | 0.006738 | 59 | 3 |
| 7 | 0.049787 | 32 | 3 |
| 8 | 0.006738 | 38 | 3 |
| 9 | 22026.465795 | 52 | 3 |
# Detailed per-split scoring metrics (AUROC/AUPRC per class, f1, ppv/npv,
# sensitivity/specificity) in long form; persisted alongside the model outputs.
scoring_metrics = MyMultiTrainTester.getScores()
scoring_metrics_df = pd.DataFrame(scoring_metrics)
scoring_metrics_df.to_csv(os.path.join(output_dir, 'scoring_metrics.csv'))
sns.set(rc={"figure.figsize":(12, 8)})
sns.boxplot(data = scoring_metrics_df, x = 'score_type', y = 'value')
<AxesSubplot:xlabel='score_type', ylabel='value'>
# Summarize every metric across the cross-validation splits
# (mean / median / population standard deviation).
score_names = np.unique(scoring_metrics['score_type'])
summary_rows = {'score': [], 'mean': [], 'median': [], 'std_dev': []}
for name in score_names:
    vals = scoring_metrics_df.loc[scoring_metrics_df['score_type'] == name, 'value'].to_numpy()
    summary_rows['score'].append(name)
    summary_rows['mean'].append(np.mean(vals))
    summary_rows['median'].append(np.median(vals))
    summary_rows['std_dev'].append(np.std(vals))
score_stats_df = pd.DataFrame(summary_rows)
score_stats_df
| score | mean | median | std_dev | |
|---|---|---|---|---|
| 0 | AUPRC_NEG | 0.708945 | 0.709353 | 0.061106 |
| 1 | AUPRC_POS | 0.830226 | 0.839137 | 0.052192 |
| 2 | AUROC_NEG | 0.781356 | 0.771767 | 0.035092 |
| 3 | AUROC_POS | 0.781356 | 0.771767 | 0.035092 |
| 4 | f1_score | 0.718641 | 0.699177 | 0.050161 |
| 5 | npv_score | 0.625896 | 0.622530 | 0.071093 |
| 6 | ppv_score | 0.766935 | 0.773727 | 0.072994 |
| 7 | sensitivity | 0.682805 | 0.684397 | 0.070906 |
| 8 | specificity | 0.712198 | 0.699666 | 0.120027 |
# Persist the summary next to the other artifacts. The original wrote to the
# working directory ('score_stats.csv'), where successive pipeline runs
# overwrite each other; write into output_dir like scoring_metrics.csv.
score_stats_df.to_csv(os.path.join(output_dir, 'score_stats.csv'))
# Per-split diagnostic panels.
MyMultiTrainTester.plot_confusion(normalize=True, figsize=(15,25))
MyMultiTrainTester.plot_confusion(normalize=False, figsize=(15,25))
MyMultiTrainTester.plot_class_freq(normalize=True, figsize=(15,35))
MyMultiTrainTester.plot_precrecall(figsize=(15,35))
This notebook is intended as a generic notebook to be used with the papermill Python library to allow automated generation of analyses and reports for classifiers on microbiome data generated by the kraken2 pipeline
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
from sklearn import model_selection
from sklearn import metrics
import os
import re
import copy
import numpy as np
import pandas as pd
import sys
sys.path.insert(0, '/project/6011811/data/microbiome_OJS/workflow/src/')
from MicroBiome import MicroBiomeDataSet, Trainer, TrainTester, MultiTrainTester, list_transformer, DiffExpTransform
from ScoreFunctions import *
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model as LM
import seaborn as sns
import pickle as pk
from matplotlib import pyplot as plt
# Ignore warning messages
if True:
import warnings
warnings.filterwarnings('ignore')
# Default parameters (papermill "parameters" cell); placeholders that the
# injected cell below overrides at execution time.
input_dir = '/project/data/preprocessed/PE_50K_sex_complete'
output_dir = '/project/results/LR_Classifier_DE_generic'
retrain = True
fdr_DE = 0.05
# Parameters
# Injected by papermill; these override the defaults above for this run.
input_dir = "results/kraken2_PE_50K/notebooks/PE_50K_Sex/prepped_data"
output_dir = "results/kraken2_PE_50K/notebooks/PE_50K_Sex/LR_DE"
retrain = True
fdr_DE = 0.05
# Sanity check: list the prepped-data directory contents.
os.listdir(input_dir)
['meta_data_mat.pk', 'metadata_samples_keep.csv', 'y.pk', 'feat_meta.csv', 'X.pk']
infile_X = open(os.path.join(input_dir, 'X.pk'),'rb')
X = pk.load(infile_X)
infile_X.close()
infile_y = open(os.path.join(input_dir, 'y.pk'),'rb')
y = pk.load(infile_y)
infile_y.close()
# infile_meta_data_mat = open(os.path.join(input_dir, 'meta_data_mat.pk'), 'rb')
# meta_data_mat = pk.load(infile_meta_data_mat)
# infile_meta_data_mat.close()
# model input
# X_inp = np.concatenate([X, meta_data_mat], axis=1)
X_inp = X
n_splits = 10
out_path = os.path.join(output_dir, 'MyMultiTrainTester.pk')
if retrain:
    # Clear previous results, if any. shutil.rmtree replaces the former
    # `os.system('rm -rf ' + output_dir)`: no shell involved, and failures
    # raise instead of being silently ignored.
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)
    # Model: elastic-net logistic regression on the PCA of the
    # differentially-abundant (DE) features.
    MyLogistic = LM.LogisticRegression(random_state=42, class_weight='balanced',
                                       penalty='elasticnet', solver='saga', l1_ratio=0.5)
    MatrixPipeLine = Pipeline([('DE0', DiffExpTransform(fdr=fdr_DE)), ('scaler0', StandardScaler()), ('pca', PCA())])
    clf = Pipeline([('TransformedData', MatrixPipeLine), ('LR', MyLogistic)])
    # Grid: regularization strength C over e^-10..e^9; PCA fixed at 3 components.
    param_grid = dict(LR__C=np.exp(-np.arange(-10, 10)),
                      TransformedData__pca__n_components=np.arange(3, 4))
    model = model_selection.GridSearchCV(clf, param_grid, scoring=metrics.make_scorer(metrics.f1_score), cv=5)
    # Trainer
    MyTrainer = Trainer(model=model)
    # random seed used in class definition is not used in final output models
    MyTrainTester = TrainTester(MyTrainer, metrics.f1_score)
    # note that random seed here affects sequence of seeds passed to making new TrainTester objects
    # using LRTrainTester as template. Thus, you have all settings but seed affecting sample split
    # across all data splits
    MyMultiTrainTester = MultiTrainTester(MyTrainTester, numpy_rand_seed=42, n_splits=n_splits)
    MyMultiTrainTester.train(X_inp, y)
    # Persist the trained object so retrain=False runs can reload it.
    with open(out_path, 'wb') as outfile:
        pk.dump(MyMultiTrainTester, outfile)
else:
    # Load previous results.
    with open(out_path, 'rb') as infile:
        MyMultiTrainTester = pk.load(infile)
Running for split 1 of 10 Using predict_proba getting predictions from probs Running for split 2 of 10 Using predict_proba getting predictions from probs Running for split 3 of 10 Using predict_proba getting predictions from probs Running for split 4 of 10 Using predict_proba getting predictions from probs Running for split 5 of 10 Using predict_proba getting predictions from probs Running for split 6 of 10 Using predict_proba getting predictions from probs Running for split 7 of 10 Using predict_proba getting predictions from probs Running for split 8 of 10 Using predict_proba getting predictions from probs Running for split 9 of 10 Using predict_proba getting predictions from probs Running for split 10 of 10 Using predict_proba getting predictions from probs
# Per-split F1 on train and held-out test folds, in long form.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat (default ignore_index=False, matching append) is the
# supported replacement.
scores_df = pd.concat([
    pd.DataFrame({'score': MyMultiTrainTester.train_scores, 'stage': np.repeat('train', n_splits)}),
    pd.DataFrame({'score': MyMultiTrainTester.test_scores, 'stage': np.repeat('test', n_splits)}),
])
scores_df
| score | stage | |
|---|---|---|
| 0 | 0.718121 | train |
| 1 | 0.720000 | train |
| 2 | 0.729560 | train |
| 3 | 0.724719 | train |
| 4 | 0.734007 | train |
| 5 | 0.734007 | train |
| 6 | 0.708333 | train |
| 7 | 0.714286 | train |
| 8 | 0.732673 | train |
| 9 | 0.740984 | train |
| 0 | 0.780488 | test |
| 1 | 0.702703 | test |
| 2 | 0.727273 | test |
| 3 | 0.787234 | test |
| 4 | 0.689655 | test |
| 5 | 0.689655 | test |
| 6 | 0.646154 | test |
| 7 | 0.823529 | test |
| 8 | 0.756098 | test |
| 9 | 0.716049 | test |
# Compare train vs. held-out test F1 distributions across the splits.
sns.boxplot(data = scores_df, x = 'stage', y = 'score')
<AxesSubplot:xlabel='stage', ylabel='score'>
# Spot-check split i's fitted DE filter: project X_inp onto the
# differentially-abundant features selected for that split.
i = 1
MyMultiTrainTester.TrainerList[i].model.best_estimator_['TransformedData'].named_steps['DE0'].transform(X_inp)
array([[0.01, 0. , 0.26, ..., 0.11, 0. , 0. ],
[0.01, 0.01, 0.05, ..., 0.03, 0. , 0.12],
[0.01, 0.01, 0.43, ..., 0.25, 0. , 0.01],
...,
[0.02, 0. , 0. , ..., 3.94, 0. , 0.05],
[0.06, 0. , 0.01, ..., 6.59, 0. , 0.11],
[0.29, 0. , 0.05, ..., 3.04, 0. , 2.36]], dtype=float32)
# hyperparams = {'l1_ratio': [], 'C': []}
# feats_in_split = []
# Best hyper-parameters per split: LR regularization C, number of
# DE-selected features, and retained PCA dimensionality.
hyperparams = {'C': [], 'n_feats': [], 'n_components': []}
for i in range(n_splits):
    fitted = MyMultiTrainTester.TrainerList[i].model
    hyperparams['C'].append(fitted.best_params_['LR__C'])
    # Only the count of selected features is used, so the former
    # copy.deepcopy of the selection mask was unnecessary work.
    selected_feats_i = fitted.best_estimator_['TransformedData'].named_steps['DE0'].selected_feats
    hyperparams['n_feats'].append(np.sum(selected_feats_i))
    hyperparams['n_components'].append(fitted.best_params_['TransformedData__pca__n_components'])
hyperparams_df = pd.DataFrame(hyperparams)
hyperparams_df
| C | n_feats | n_components | |
|---|---|---|---|
| 0 | 2.718282 | 71 | 3 |
| 1 | 22026.465795 | 87 | 3 |
| 2 | 0.367879 | 67 | 3 |
| 3 | 0.006738 | 54 | 3 |
| 4 | 1.000000 | 79 | 3 |
| 5 | 1.000000 | 79 | 3 |
| 6 | 0.006738 | 80 | 3 |
| 7 | 1.000000 | 62 | 3 |
| 8 | 0.367879 | 66 | 3 |
| 9 | 22026.465795 | 90 | 3 |
# Detailed per-split scoring metrics (AUROC/AUPRC per class, f1, ppv/npv,
# sensitivity/specificity) in long form; persisted alongside the model outputs.
scoring_metrics = MyMultiTrainTester.getScores()
scoring_metrics_df = pd.DataFrame(scoring_metrics)
scoring_metrics_df.to_csv(os.path.join(output_dir, 'scoring_metrics.csv'))
sns.set(rc={"figure.figsize":(12, 8)})
sns.boxplot(data = scoring_metrics_df, x = 'score_type', y = 'value')
<AxesSubplot:xlabel='score_type', ylabel='value'>
# Summarize every metric across the cross-validation splits
# (mean / median / population standard deviation).
score_names = np.unique(scoring_metrics['score_type'])
summary_rows = {'score': [], 'mean': [], 'median': [], 'std_dev': []}
for name in score_names:
    vals = scoring_metrics_df.loc[scoring_metrics_df['score_type'] == name, 'value'].to_numpy()
    summary_rows['score'].append(name)
    summary_rows['mean'].append(np.mean(vals))
    summary_rows['median'].append(np.median(vals))
    summary_rows['std_dev'].append(np.std(vals))
score_stats_df = pd.DataFrame(summary_rows)
score_stats_df
| score | mean | median | std_dev | |
|---|---|---|---|---|
| 0 | AUPRC_NEG | 0.724282 | 0.720087 | 0.052607 |
| 1 | AUPRC_POS | 0.840898 | 0.857181 | 0.050441 |
| 2 | AUROC_NEG | 0.793811 | 0.800000 | 0.038876 |
| 3 | AUROC_POS | 0.793811 | 0.800000 | 0.038876 |
| 4 | f1_score | 0.731884 | 0.721661 | 0.051430 |
| 5 | npv_score | 0.641530 | 0.644837 | 0.064656 |
| 6 | ppv_score | 0.804264 | 0.803846 | 0.055249 |
| 7 | sensitivity | 0.675815 | 0.677778 | 0.072661 |
| 8 | specificity | 0.770924 | 0.814991 | 0.097715 |
# Persist the summary next to the other artifacts. The original wrote to the
# working directory ('score_stats.csv'), where successive pipeline runs
# overwrite each other; write into output_dir like scoring_metrics.csv.
score_stats_df.to_csv(os.path.join(output_dir, 'score_stats.csv'))
# Per-split diagnostic panels.
MyMultiTrainTester.plot_confusion(normalize=True, figsize=(15,25))
MyMultiTrainTester.plot_confusion(normalize=False, figsize=(15,25))
MyMultiTrainTester.plot_class_freq(normalize=True, figsize=(15,35))
MyMultiTrainTester.plot_precrecall(figsize=(15,35))
This notebook is intended as a generic notebook to be used with the papermill Python library to allow automated generation of analyses and reports for classifiers on microbiome data generated by the kraken2 pipeline
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
from sklearn import model_selection
from sklearn import metrics
import os
import re
import copy
import numpy as np
import pandas as pd
import sys
sys.path.insert(0, '/project/6011811/data/microbiome_OJS/workflow/src/')
from MicroBiome import MicroBiomeDataSet, Trainer, TrainTester, MultiTrainTester, list_transformer, DiffExpTransform
from ScoreFunctions import *
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model as LM
import seaborn as sns
import pickle as pk
from matplotlib import pyplot as plt
# Ignore warning messages
if True:
import warnings
warnings.filterwarnings('ignore')
# Default parameters (papermill "parameters" cell); placeholders that the
# injected cell below overrides at execution time.
input_dir = '/project/data/preprocessed/PE_50K_sex_complete'
output_dir = '/project/results/LR_Classifier_DE_generic'
retrain = True
fdr_DE = 0.05
# Parameters
# Injected by papermill; these override the defaults above for this run.
input_dir = "results/kraken2_PE_100K/notebooks/PE_100K_Sex/prepped_data"
output_dir = "results/kraken2_PE_100K/notebooks/PE_100K_Sex/LR_DE"
retrain = True
fdr_DE = 0.05
# Sanity check: list the prepped-data directory contents.
os.listdir(input_dir)
['meta_data_mat.pk', 'metadata_samples_keep.csv', 'y.pk', 'feat_meta.csv', 'X.pk']
infile_X = open(os.path.join(input_dir, 'X.pk'),'rb')
X = pk.load(infile_X)
infile_X.close()
infile_y = open(os.path.join(input_dir, 'y.pk'),'rb')
y = pk.load(infile_y)
infile_y.close()
# infile_meta_data_mat = open(os.path.join(input_dir, 'meta_data_mat.pk'), 'rb')
# meta_data_mat = pk.load(infile_meta_data_mat)
# infile_meta_data_mat.close()
# model input
# X_inp = np.concatenate([X, meta_data_mat], axis=1)
X_inp = X
n_splits = 10
out_path = os.path.join(output_dir, 'MyMultiTrainTester.pk')
if retrain:
    # Clear previous results, if any. shutil.rmtree replaces the former
    # `os.system('rm -rf ' + output_dir)`: no shell involved, and failures
    # raise instead of being silently ignored.
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)
    # Model: elastic-net logistic regression on the PCA of the
    # differentially-abundant (DE) features.
    MyLogistic = LM.LogisticRegression(random_state=42, class_weight='balanced',
                                       penalty='elasticnet', solver='saga', l1_ratio=0.5)
    MatrixPipeLine = Pipeline([('DE0', DiffExpTransform(fdr=fdr_DE)), ('scaler0', StandardScaler()), ('pca', PCA())])
    clf = Pipeline([('TransformedData', MatrixPipeLine), ('LR', MyLogistic)])
    # Grid: regularization strength C over e^-10..e^9; PCA fixed at 3 components.
    param_grid = dict(LR__C=np.exp(-np.arange(-10, 10)),
                      TransformedData__pca__n_components=np.arange(3, 4))
    model = model_selection.GridSearchCV(clf, param_grid, scoring=metrics.make_scorer(metrics.f1_score), cv=5)
    # Trainer
    MyTrainer = Trainer(model=model)
    # random seed used in class definition is not used in final output models
    MyTrainTester = TrainTester(MyTrainer, metrics.f1_score)
    # note that random seed here affects sequence of seeds passed to making new TrainTester objects
    # using LRTrainTester as template. Thus, you have all settings but seed affecting sample split
    # across all data splits
    MyMultiTrainTester = MultiTrainTester(MyTrainTester, numpy_rand_seed=42, n_splits=n_splits)
    MyMultiTrainTester.train(X_inp, y)
    # Persist the trained object so retrain=False runs can reload it.
    with open(out_path, 'wb') as outfile:
        pk.dump(MyMultiTrainTester, outfile)
else:
    # Load previous results.
    with open(out_path, 'rb') as infile:
        MyMultiTrainTester = pk.load(infile)
Running for split 1 of 10 Using predict_proba getting predictions from probs Running for split 2 of 10 Using predict_proba getting predictions from probs Running for split 3 of 10 Using predict_proba getting predictions from probs Running for split 4 of 10 Using predict_proba getting predictions from probs Running for split 5 of 10 Using predict_proba getting predictions from probs Running for split 6 of 10 Using predict_proba getting predictions from probs Running for split 7 of 10 Using predict_proba getting predictions from probs Running for split 8 of 10 Using predict_proba getting predictions from probs Running for split 9 of 10 Using predict_proba getting predictions from probs Running for split 10 of 10 Using predict_proba getting predictions from probs
# Per-split F1 on train and held-out test folds, in long form.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat (default ignore_index=False, matching append) is the
# supported replacement.
scores_df = pd.concat([
    pd.DataFrame({'score': MyMultiTrainTester.train_scores, 'stage': np.repeat('train', n_splits)}),
    pd.DataFrame({'score': MyMultiTrainTester.test_scores, 'stage': np.repeat('test', n_splits)}),
])
scores_df
| score | stage | |
|---|---|---|
| 0 | 0.735786 | train |
| 1 | 0.720539 | train |
| 2 | 0.716612 | train |
| 3 | 0.706667 | train |
| 4 | 0.727891 | train |
| 5 | 0.727891 | train |
| 6 | 0.721519 | train |
| 7 | 0.707904 | train |
| 8 | 0.702703 | train |
| 9 | 0.722408 | train |
| 0 | 0.746667 | test |
| 1 | 0.759494 | test |
| 2 | 0.711864 | test |
| 3 | 0.761905 | test |
| 4 | 0.674699 | test |
| 5 | 0.674699 | test |
| 6 | 0.678261 | test |
| 7 | 0.776471 | test |
| 8 | 0.756098 | test |
| 9 | 0.731707 | test |
# Compare train vs. held-out test F1 distributions across the splits.
sns.boxplot(data = scores_df, x = 'stage', y = 'score')
<AxesSubplot:xlabel='stage', ylabel='score'>
# Spot-check split i's fitted DE filter: project X_inp onto the
# differentially-abundant features selected for that split.
i = 1
MyMultiTrainTester.TrainerList[i].model.best_estimator_['TransformedData'].named_steps['DE0'].transform(X_inp)
array([[0.01, 0. , 0.69, ..., 0.13, 0. , 0. ],
[0.01, 0.02, 0.03, ..., 0.03, 0. , 0.1 ],
[0.02, 0.02, 0.46, ..., 0.28, 0. , 0.02],
...,
[0.03, 0. , 0.01, ..., 3.79, 0. , 0.04],
[0.06, 0. , 0.01, ..., 6.84, 0. , 0.09],
[0.21, 0. , 0.1 , ..., 3.16, 0. , 2.4 ]], dtype=float32)
# hyperparams = {'l1_ratio': [], 'C': []}
# feats_in_split = []
# Best hyper-parameters per split: LR regularization C, number of
# DE-selected features, and retained PCA dimensionality.
hyperparams = {'C': [], 'n_feats': [], 'n_components': []}
for i in range(n_splits):
    fitted = MyMultiTrainTester.TrainerList[i].model
    hyperparams['C'].append(fitted.best_params_['LR__C'])
    # Only the count of selected features is used, so the former
    # copy.deepcopy of the selection mask was unnecessary work.
    selected_feats_i = fitted.best_estimator_['TransformedData'].named_steps['DE0'].selected_feats
    hyperparams['n_feats'].append(np.sum(selected_feats_i))
    hyperparams['n_components'].append(fitted.best_params_['TransformedData__pca__n_components'])
hyperparams_df = pd.DataFrame(hyperparams)
hyperparams_df
| C | n_feats | n_components | |
|---|---|---|---|
| 0 | 0.135335 | 80 | 3 |
| 1 | 0.367879 | 94 | 3 |
| 2 | 0.006738 | 78 | 3 |
| 3 | 22026.465795 | 60 | 3 |
| 4 | 22026.465795 | 94 | 3 |
| 5 | 22026.465795 | 94 | 3 |
| 6 | 0.000912 | 107 | 3 |
| 7 | 0.367879 | 85 | 3 |
| 8 | 0.367879 | 84 | 3 |
| 9 | 22026.465795 | 106 | 3 |
# Detailed per-split scoring metrics (AUROC/AUPRC per class, f1, ppv/npv,
# sensitivity/specificity) in long form; persisted alongside the model outputs.
scoring_metrics = MyMultiTrainTester.getScores()
scoring_metrics_df = pd.DataFrame(scoring_metrics)
scoring_metrics_df.to_csv(os.path.join(output_dir, 'scoring_metrics.csv'))
sns.set(rc={"figure.figsize":(12, 8)})
sns.boxplot(data = scoring_metrics_df, x = 'score_type', y = 'value')
<AxesSubplot:xlabel='score_type', ylabel='value'>
# Summarize every metric across the cross-validation splits
# (mean / median / population standard deviation).
score_names = np.unique(scoring_metrics['score_type'])
summary_rows = {'score': [], 'mean': [], 'median': [], 'std_dev': []}
for name in score_names:
    vals = scoring_metrics_df.loc[scoring_metrics_df['score_type'] == name, 'value'].to_numpy()
    summary_rows['score'].append(name)
    summary_rows['mean'].append(np.mean(vals))
    summary_rows['median'].append(np.median(vals))
    summary_rows['std_dev'].append(np.std(vals))
score_stats_df = pd.DataFrame(summary_rows)
score_stats_df
| score | mean | median | std_dev | |
|---|---|---|---|---|
| 0 | AUPRC_NEG | 0.691685 | 0.711457 | 0.096850 |
| 1 | AUPRC_POS | 0.814878 | 0.850335 | 0.107346 |
| 2 | AUROC_NEG | 0.761919 | 0.791622 | 0.096519 |
| 3 | AUROC_POS | 0.761919 | 0.791622 | 0.096519 |
| 4 | f1_score | 0.727186 | 0.739187 | 0.037486 |
| 5 | npv_score | 0.567764 | 0.636302 | 0.198364 |
| 6 | ppv_score | 0.800267 | 0.815662 | 0.110893 |
| 7 | sensitivity | 0.692774 | 0.695508 | 0.114689 |
| 8 | specificity | 0.724199 | 0.784156 | 0.254022 |
# Persist the summary next to the other artifacts. The original wrote to the
# working directory ('score_stats.csv'), where successive pipeline runs
# overwrite each other; write into output_dir like scoring_metrics.csv.
score_stats_df.to_csv(os.path.join(output_dir, 'score_stats.csv'))
# Per-split diagnostic panels.
MyMultiTrainTester.plot_confusion(normalize=True, figsize=(15,25))
MyMultiTrainTester.plot_confusion(normalize=False, figsize=(15,25))
MyMultiTrainTester.plot_class_freq(normalize=True, figsize=(15,35))
MyMultiTrainTester.plot_precrecall(figsize=(15,35))
This notebook is intended as a generic notebook to be used with the papermill Python library to allow automated generation of analyses and reports for classifiers on microbiome data generated by the kraken2 pipeline
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
from sklearn import model_selection
from sklearn import metrics
import os
import re
import copy
import numpy as np
import pandas as pd
import sys
sys.path.insert(0, '/project/6011811/data/microbiome_OJS/workflow/src/')
from MicroBiome import MicroBiomeDataSet, Trainer, TrainTester, MultiTrainTester, list_transformer, DiffExpTransform
from ScoreFunctions import *
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model as LM
import seaborn as sns
import pickle as pk
from matplotlib import pyplot as plt
# Ignore warning messages
if True:
import warnings
warnings.filterwarnings('ignore')
# Default parameters (papermill "parameters" cell); placeholders that the
# injected cell below overrides at execution time.
input_dir = '/project/data/preprocessed/PE_50K_sex_complete'
output_dir = '/project/results/LR_Classifier_DE_generic'
retrain = True
fdr_DE = 0.05
# Parameters
# Injected by papermill; these override the defaults above for this run.
input_dir = "results/kraken2_PE_500K/notebooks/PE_500K_Sex/prepped_data"
output_dir = "results/kraken2_PE_500K/notebooks/PE_500K_Sex/LR_DE"
retrain = True
fdr_DE = 0.05
# Sanity check: list the prepped-data directory contents.
os.listdir(input_dir)
['meta_data_mat.pk', 'metadata_samples_keep.csv', 'y.pk', 'feat_meta.csv', 'X.pk']
infile_X = open(os.path.join(input_dir, 'X.pk'),'rb')
X = pk.load(infile_X)
infile_X.close()
infile_y = open(os.path.join(input_dir, 'y.pk'),'rb')
y = pk.load(infile_y)
infile_y.close()
# infile_meta_data_mat = open(os.path.join(input_dir, 'meta_data_mat.pk'), 'rb')
# meta_data_mat = pk.load(infile_meta_data_mat)
# infile_meta_data_mat.close()
# model input
# X_inp = np.concatenate([X, meta_data_mat], axis=1)
X_inp = X
n_splits = 10
out_path = os.path.join(output_dir, 'MyMultiTrainTester.pk')
if retrain:
    # Clear previous results, if any. shutil.rmtree replaces the former
    # `os.system('rm -rf ' + output_dir)`: no shell involved, and failures
    # raise instead of being silently ignored.
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)
    # Model: elastic-net logistic regression on the PCA of the
    # differentially-abundant (DE) features.
    MyLogistic = LM.LogisticRegression(random_state=42, class_weight='balanced',
                                       penalty='elasticnet', solver='saga', l1_ratio=0.5)
    MatrixPipeLine = Pipeline([('DE0', DiffExpTransform(fdr=fdr_DE)), ('scaler0', StandardScaler()), ('pca', PCA())])
    clf = Pipeline([('TransformedData', MatrixPipeLine), ('LR', MyLogistic)])
    # Grid: regularization strength C over e^-10..e^9; PCA fixed at 3 components.
    param_grid = dict(LR__C=np.exp(-np.arange(-10, 10)),
                      TransformedData__pca__n_components=np.arange(3, 4))
    model = model_selection.GridSearchCV(clf, param_grid, scoring=metrics.make_scorer(metrics.f1_score), cv=5)
    # Trainer
    MyTrainer = Trainer(model=model)
    # random seed used in class definition is not used in final output models
    MyTrainTester = TrainTester(MyTrainer, metrics.f1_score)
    # note that random seed here affects sequence of seeds passed to making new TrainTester objects
    # using LRTrainTester as template. Thus, you have all settings but seed affecting sample split
    # across all data splits
    MyMultiTrainTester = MultiTrainTester(MyTrainTester, numpy_rand_seed=42, n_splits=n_splits)
    MyMultiTrainTester.train(X_inp, y)
    # Persist the trained object so retrain=False runs can reload it.
    with open(out_path, 'wb') as outfile:
        pk.dump(MyMultiTrainTester, outfile)
else:
    # Load previous results.
    with open(out_path, 'rb') as infile:
        MyMultiTrainTester = pk.load(infile)
Running for split 1 of 10 Using predict_proba getting predictions from probs Running for split 2 of 10 Using predict_proba getting predictions from probs Running for split 3 of 10 Using predict_proba getting predictions from probs Running for split 4 of 10 Using predict_proba getting predictions from probs Running for split 5 of 10 Using predict_proba getting predictions from probs Running for split 6 of 10 Using predict_proba getting predictions from probs Running for split 7 of 10 Using predict_proba getting predictions from probs Running for split 8 of 10 Using predict_proba getting predictions from probs Running for split 9 of 10 Using predict_proba getting predictions from probs Running for split 10 of 10 Using predict_proba getting predictions from probs
# Per-split F1 on train and held-out test folds, in long form.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat (default ignore_index=False, matching append) is the
# supported replacement.
scores_df = pd.concat([
    pd.DataFrame({'score': MyMultiTrainTester.train_scores, 'stage': np.repeat('train', n_splits)}),
    pd.DataFrame({'score': MyMultiTrainTester.test_scores, 'stage': np.repeat('test', n_splits)}),
])
scores_df
| score | stage | |
|---|---|---|
| 0 | 0.695946 | train |
| 1 | 0.709459 | train |
| 2 | 0.698718 | train |
| 3 | 0.715719 | train |
| 4 | 0.682759 | train |
| 5 | 0.682759 | train |
| 6 | 0.721519 | train |
| 7 | 0.682927 | train |
| 8 | 0.706271 | train |
| 9 | 0.702703 | train |
| 0 | 0.750000 | test |
| 1 | 0.736842 | test |
| 2 | 0.656250 | test |
| 3 | 0.752941 | test |
| 4 | 0.666667 | test |
| 5 | 0.666667 | test |
| 6 | 0.678261 | test |
| 7 | 0.734177 | test |
| 8 | 0.717949 | test |
| 9 | 0.683544 | test |
# Compare train vs. held-out test F1 distributions across the splits.
sns.boxplot(data = scores_df, x = 'stage', y = 'score')
<AxesSubplot:xlabel='stage', ylabel='score'>
# Spot-check split i's fitted DE filter: project X_inp onto the
# differentially-abundant features selected for that split.
i = 1
MyMultiTrainTester.TrainerList[i].model.best_estimator_['TransformedData'].named_steps['DE0'].transform(X_inp)
array([[0.01, 0. , 0.71, ..., 0. , 0. , 0. ],
[0.01, 0.01, 0.03, ..., 0. , 0. , 0. ],
[0.02, 0.02, 0.49, ..., 0. , 0. , 0. ],
...,
[0.03, 0. , 0.01, ..., 0. , 0. , 0. ],
[0.05, 0. , 0.01, ..., 0. , 0. , 0. ],
[0.23, 0. , 0.09, ..., 0. , 0.01, 0.01]], dtype=float32)
# hyperparams = {'l1_ratio': [], 'C': []}
# feats_in_split = []
# Best hyper-parameters per split: LR regularization C, number of
# DE-selected features, and retained PCA dimensionality.
hyperparams = {'C': [], 'n_feats': [], 'n_components': []}
for i in range(n_splits):
    fitted = MyMultiTrainTester.TrainerList[i].model
    hyperparams['C'].append(fitted.best_params_['LR__C'])
    # Only the count of selected features is used, so the former
    # copy.deepcopy of the selection mask was unnecessary work.
    selected_feats_i = fitted.best_estimator_['TransformedData'].named_steps['DE0'].selected_feats
    hyperparams['n_feats'].append(np.sum(selected_feats_i))
    hyperparams['n_components'].append(fitted.best_params_['TransformedData__pca__n_components'])
hyperparams_df = pd.DataFrame(hyperparams)
hyperparams_df
| C | n_feats | n_components | |
|---|---|---|---|
| 0 | 0.018316 | 105 | 3 |
| 1 | 0.135335 | 128 | 3 |
| 2 | 0.049787 | 103 | 3 |
| 3 | 22026.465795 | 86 | 3 |
| 4 | 1.000000 | 110 | 3 |
| 5 | 1.000000 | 110 | 3 |
| 6 | 0.000912 | 126 | 3 |
| 7 | 0.018316 | 100 | 3 |
| 8 | 0.367879 | 104 | 3 |
| 9 | 0.367879 | 132 | 3 |
# Detailed per-split scoring metrics (AUROC/AUPRC per class, f1, ppv/npv,
# sensitivity/specificity) in long form; persisted alongside the model outputs.
scoring_metrics = MyMultiTrainTester.getScores()
scoring_metrics_df = pd.DataFrame(scoring_metrics)
scoring_metrics_df.to_csv(os.path.join(output_dir, 'scoring_metrics.csv'))
sns.set(rc={"figure.figsize":(12, 8)})
sns.boxplot(data = scoring_metrics_df, x = 'score_type', y = 'value')
<AxesSubplot:xlabel='score_type', ylabel='value'>
# Summarize every metric across the cross-validation splits
# (mean / median / population standard deviation).
score_names = np.unique(scoring_metrics['score_type'])
summary_rows = {'score': [], 'mean': [], 'median': [], 'std_dev': []}
for name in score_names:
    vals = scoring_metrics_df.loc[scoring_metrics_df['score_type'] == name, 'value'].to_numpy()
    summary_rows['score'].append(name)
    summary_rows['mean'].append(np.mean(vals))
    summary_rows['median'].append(np.median(vals))
    summary_rows['std_dev'].append(np.std(vals))
score_stats_df = pd.DataFrame(summary_rows)
score_stats_df
| score | mean | median | std_dev | |
|---|---|---|---|---|
| 0 | AUPRC_NEG | 0.691202 | 0.689191 | 0.089171 |
| 1 | AUPRC_POS | 0.810247 | 0.847764 | 0.106226 |
| 2 | AUROC_NEG | 0.757399 | 0.782796 | 0.093710 |
| 3 | AUROC_POS | 0.757399 | 0.782796 | 0.093710 |
| 4 | f1_score | 0.704330 | 0.700747 | 0.035836 |
| 5 | npv_score | 0.548577 | 0.610018 | 0.189812 |
| 6 | ppv_score | 0.781653 | 0.797059 | 0.101921 |
| 7 | sensitivity | 0.665881 | 0.632540 | 0.119177 |
| 8 | specificity | 0.718756 | 0.791075 | 0.245590 |
# Persist the summary next to the other artifacts. The original wrote to the
# working directory ('score_stats.csv'), where successive pipeline runs
# overwrite each other; write into output_dir like scoring_metrics.csv.
score_stats_df.to_csv(os.path.join(output_dir, 'score_stats.csv'))
# Per-split diagnostic panels.
MyMultiTrainTester.plot_confusion(normalize=True, figsize=(15,25))
MyMultiTrainTester.plot_confusion(normalize=False, figsize=(15,25))
MyMultiTrainTester.plot_class_freq(normalize=True, figsize=(15,35))
MyMultiTrainTester.plot_precrecall(figsize=(15,35))
This notebook is intended as a generic notebook to be used with the papermill Python library to allow automated generation of analyses and reports for classifiers on microbiome data generated by the kraken2 pipeline
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
from sklearn import model_selection
from sklearn import metrics
import os
import re
import copy
import numpy as np
import pandas as pd
import sys
sys.path.insert(0, '/project/6011811/data/microbiome_OJS/workflow/src/')
from MicroBiome import MicroBiomeDataSet, Trainer, TrainTester, MultiTrainTester, list_transformer, DiffExpTransform
from ScoreFunctions import *
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model as LM
import seaborn as sns
import pickle as pk
from matplotlib import pyplot as plt
# Ignore warning messages
if True:
import warnings
warnings.filterwarnings('ignore')
# Default parameters for interactive use; papermill overrides these via the
# injected "# Parameters" cell that follows.
input_dir = '/project/data/preprocessed/PE_50K_sex_complete'
output_dir = '/project/results/LR_Classifier_DE_generic'
retrain = True
fdr_DE = 0.05
# Parameters
# (papermill-injected cell: values for this particular run)
input_dir = "results/kraken2_PE_1M/notebooks/PE_1M_Sex/prepped_data"
output_dir = "results/kraken2_PE_1M/notebooks/PE_1M_Sex/LR_DE"
retrain = True
fdr_DE = 0.05
# Sanity check: list the prepped-data files expected in input_dir.
os.listdir(input_dir)
['meta_data_mat.pk', 'metadata_samples_keep.csv', 'y.pk', 'feat_meta.csv', 'X.pk']
# Load the feature matrix X and labels y produced by the data-prep notebook.
# Fix: use context managers so the file handles are closed even on error.
with open(os.path.join(input_dir, 'X.pk'), 'rb') as infile_X:
    X = pk.load(infile_X)
with open(os.path.join(input_dir, 'y.pk'), 'rb') as infile_y:
    y = pk.load(infile_y)
# Metadata matrix loading is disabled; only microbiome features are used.
# infile_meta_data_mat = open(os.path.join(input_dir, 'meta_data_mat.pk'), 'rb')
# meta_data_mat = pk.load(infile_meta_data_mat)
# infile_meta_data_mat.close()
# model input
# X_inp = np.concatenate([X, meta_data_mat], axis=1)
X_inp = X
n_splits = 10
out_path = os.path.join(output_dir, 'MyMultiTrainTester.pk')
if retrain:
    # Clear previous results, if any. shutil.rmtree replaces the former
    # os.system('rm -rf ' + output_dir), which was non-portable and unsafe
    # for paths containing spaces or shell metacharacters.
    import shutil
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)
    # We run PCA on the microbiome data's differentially abundant features.
    MyLogistic = LM.LogisticRegression(random_state=42, class_weight='balanced',
                                       penalty='elasticnet', solver='saga', l1_ratio=0.5)
    MatrixPipeLine = Pipeline([('DE0', DiffExpTransform(fdr=fdr_DE)),
                               ('scaler0', StandardScaler()),
                               ('pca', PCA())])
    clf = Pipeline([('TransformedData', MatrixPipeLine), ('LR', MyLogistic)])
    param_grid = dict(LR__C=np.exp(-np.arange(-10, 10)),
                      TransformedData__pca__n_components=np.arange(3, 4))
    model = model_selection.GridSearchCV(clf, param_grid,
                                         scoring=metrics.make_scorer(metrics.f1_score), cv=5)
    # Trainer
    MyTrainer = Trainer(model=model)
    # random seed used in class definition is not used in final output models
    MyTrainTester = TrainTester(MyTrainer, metrics.f1_score)
    # note that random seed here affects sequence of seeds passed to making new TrainTester objects
    # using MyTrainTester as template. Thus, you have all settings but seed affecting sample split
    # across all data splits
    MyMultiTrainTester = MultiTrainTester(MyTrainTester, numpy_rand_seed=42, n_splits=n_splits)
    MyMultiTrainTester.train(X_inp, y)
    # save results (context manager guarantees the handle is closed)
    with open(out_path, 'wb') as outfile:
        pk.dump(MyMultiTrainTester, outfile)
else:
    # load previous results
    with open(out_path, 'rb') as infile:
        MyMultiTrainTester = pk.load(infile)
Running for split 1 of 10 Using predict_proba getting predictions from probs Running for split 2 of 10 Using predict_proba getting predictions from probs Running for split 3 of 10 Using predict_proba getting predictions from probs Running for split 4 of 10 Using predict_proba getting predictions from probs Running for split 5 of 10 Using predict_proba getting predictions from probs Running for split 6 of 10 Using predict_proba getting predictions from probs Running for split 7 of 10 Using predict_proba getting predictions from probs Running for split 8 of 10 Using predict_proba getting predictions from probs Running for split 9 of 10 Using predict_proba getting predictions from probs Running for split 10 of 10 Using predict_proba getting predictions from probs
# Collect per-split train/test F1 scores into one long-format frame.
# Fix: DataFrame.append is deprecated and removed in pandas 2.0; use pd.concat.
scores_df = pd.concat([
    pd.DataFrame({'score': MyMultiTrainTester.train_scores,
                  'stage': np.repeat('train', n_splits)}),
    pd.DataFrame({'score': MyMultiTrainTester.test_scores,
                  'stage': np.repeat('test', n_splits)}),
])
scores_df
| score | stage | |
|---|---|---|
| 0 | 0.700000 | train |
| 1 | 0.715719 | train |
| 2 | 0.696486 | train |
| 3 | 0.695946 | train |
| 4 | 0.705085 | train |
| 5 | 0.705085 | train |
| 6 | 0.721519 | train |
| 7 | 0.668990 | train |
| 8 | 0.684385 | train |
| 9 | 0.696246 | train |
| 0 | 0.775000 | test |
| 1 | 0.684211 | test |
| 2 | 0.645161 | test |
| 3 | 0.785714 | test |
| 4 | 0.634146 | test |
| 5 | 0.634146 | test |
| 6 | 0.678261 | test |
| 7 | 0.746988 | test |
| 8 | 0.734177 | test |
| 9 | 0.658228 | test |
# Compare train vs test F1 distributions across the splits.
sns.boxplot(data = scores_df, x = 'stage', y = 'score')
<AxesSubplot:xlabel='stage', ylabel='score'>
# Spot-check: the differentially-abundant feature matrix produced by the
# DE0 step of split i's best estimator, applied to the full input.
i = 1
MyMultiTrainTester.TrainerList[i].model.best_estimator_['TransformedData'].named_steps['DE0'].transform(X_inp)
array([[0.01, 0. , 0. , ..., 0. , 0. , 0. ],
[0.02, 0.01, 0. , ..., 0. , 0. , 0. ],
[0.01, 0.02, 0. , ..., 0. , 0. , 0. ],
...,
[0.03, 0. , 0. , ..., 0. , 0. , 0. ],
[0.05, 0. , 0. , ..., 0. , 0. , 0. ],
[0.23, 0. , 0. , ..., 0. , 0. , 0.01]], dtype=float32)
# Tabulate the winning hyperparameters and number of DE-selected features
# for each split's grid search.
hyperparams = {'C': [], 'n_feats': [], 'n_components': []}
for i in range(n_splits):
    best = MyMultiTrainTester.TrainerList[i].model
    hyperparams['C'].append(best.best_params_['LR__C'])
    # selected_feats is only summed here, so the former copy.deepcopy was
    # unnecessary work.
    selected = best.best_estimator_['TransformedData'].named_steps['DE0'].selected_feats
    hyperparams['n_feats'].append(np.sum(selected))
    hyperparams['n_components'].append(best.best_params_['TransformedData__pca__n_components'])
hyperparams_df = pd.DataFrame(hyperparams)
hyperparams_df
| C | n_feats | n_components | |
|---|---|---|---|
| 0 | 0.367879 | 105 | 3 |
| 1 | 1.000000 | 127 | 3 |
| 2 | 0.049787 | 102 | 3 |
| 3 | 0.018316 | 91 | 3 |
| 4 | 22026.465795 | 107 | 3 |
| 5 | 22026.465795 | 107 | 3 |
| 6 | 0.000912 | 122 | 3 |
| 7 | 22026.465795 | 102 | 3 |
| 8 | 1.000000 | 104 | 3 |
| 9 | 0.367879 | 135 | 3 |
# Gather per-split scoring metrics and persist them for downstream reports.
scoring_metrics = MyMultiTrainTester.getScores()
scoring_metrics_df = pd.DataFrame(scoring_metrics)
metrics_csv = os.path.join(output_dir, 'scoring_metrics.csv')
scoring_metrics_df.to_csv(metrics_csv)
# Box-plot the distribution of each score type across the splits.
sns.set(rc={"figure.figsize": (12, 8)})
sns.boxplot(data=scoring_metrics_df, x='score_type', y='value')
<AxesSubplot:xlabel='score_type', ylabel='value'>
# Summarize each score type across splits.
# NOTE: np.std is the population standard deviation (ddof=0).
score_names = np.unique(scoring_metrics['score_type'])
score_stats = {'score': [], 'mean': [], 'median': [], 'std_dev': []}
for score in score_names:
    mask = scoring_metrics_df['score_type'] == score
    vals = scoring_metrics_df['value'].to_numpy()[mask]
    score_stats['score'].append(score)
    score_stats['mean'].append(np.mean(vals))
    score_stats['median'].append(np.median(vals))
    score_stats['std_dev'].append(np.std(vals))
score_stats_df = pd.DataFrame(score_stats)
score_stats_df
| score | mean | median | std_dev | |
|---|---|---|---|---|
| 0 | AUPRC_NEG | 0.683280 | 0.696634 | 0.087871 |
| 1 | AUPRC_POS | 0.807317 | 0.842270 | 0.106428 |
| 2 | AUROC_NEG | 0.751336 | 0.781416 | 0.092853 |
| 3 | AUROC_POS | 0.751336 | 0.781416 | 0.092853 |
| 4 | f1_score | 0.697603 | 0.681236 | 0.055171 |
| 5 | npv_score | 0.541766 | 0.609524 | 0.191741 |
| 6 | ppv_score | 0.768143 | 0.766968 | 0.103118 |
| 7 | sensitivity | 0.662627 | 0.631746 | 0.126121 |
| 8 | specificity | 0.701420 | 0.785579 | 0.243622 |
# Persist the summary table alongside the other outputs.
# Fix: previously written to the current working directory instead of output_dir,
# inconsistent with scoring_metrics.csv above.
score_stats_df.to_csv(os.path.join(output_dir, 'score_stats.csv'))
# Per-split diagnostic plots.
MyMultiTrainTester.plot_confusion(normalize=True, figsize=(15, 25))
MyMultiTrainTester.plot_confusion(normalize=False, figsize=(15, 25))
MyMultiTrainTester.plot_class_freq(normalize=True, figsize=(15, 35))
MyMultiTrainTester.plot_precrecall(figsize=(15, 35))
This notebook is intended as a generic template for use with the papermill Python library, allowing automated generation of analyses and reports for classifiers on microbiome data produced by the kraken2 pipeline.
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
from sklearn import model_selection
from sklearn import metrics
import os
import re
import copy
import numpy as np
import pandas as pd
import sys
sys.path.insert(0, '/project/6011811/data/microbiome_OJS/workflow/src/')
from MicroBiome import MicroBiomeDataSet, Trainer, TrainTester, MultiTrainTester, list_transformer, DiffExpTransform
from ScoreFunctions import *
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model as LM
import seaborn as sns
import pickle as pk
from matplotlib import pyplot as plt
# Ignore warning messages
if True:
import warnings
warnings.filterwarnings('ignore')
# Default parameters for interactive use; papermill overrides these via the
# injected "# Parameters" cell that follows.
input_dir = '/project/data/preprocessed/PE_50K_sex_complete'
output_dir = '/project/results/LR_Classifier_DE_generic'
retrain = True
fdr_DE = 0.05
# Parameters
# (papermill-injected cell: values for this particular run)
input_dir = "results/kraken2_PE_5M/notebooks/PE_5M_Sex/prepped_data"
output_dir = "results/kraken2_PE_5M/notebooks/PE_5M_Sex/LR_DE"
retrain = True
fdr_DE = 0.05
# Sanity check: list the prepped-data files expected in input_dir.
os.listdir(input_dir)
['meta_data_mat.pk', 'metadata_samples_keep.csv', 'y.pk', 'feat_meta.csv', 'X.pk']
# Load the feature matrix X and labels y produced by the data-prep notebook.
# Fix: use context managers so the file handles are closed even on error.
with open(os.path.join(input_dir, 'X.pk'), 'rb') as infile_X:
    X = pk.load(infile_X)
with open(os.path.join(input_dir, 'y.pk'), 'rb') as infile_y:
    y = pk.load(infile_y)
# Metadata matrix loading is disabled; only microbiome features are used.
# infile_meta_data_mat = open(os.path.join(input_dir, 'meta_data_mat.pk'), 'rb')
# meta_data_mat = pk.load(infile_meta_data_mat)
# infile_meta_data_mat.close()
# model input
# X_inp = np.concatenate([X, meta_data_mat], axis=1)
X_inp = X
n_splits = 10
out_path = os.path.join(output_dir, 'MyMultiTrainTester.pk')
if retrain:
    # Clear previous results, if any. shutil.rmtree replaces the former
    # os.system('rm -rf ' + output_dir), which was non-portable and unsafe
    # for paths containing spaces or shell metacharacters.
    import shutil
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)
    # We run PCA on the microbiome data's differentially abundant features.
    MyLogistic = LM.LogisticRegression(random_state=42, class_weight='balanced',
                                       penalty='elasticnet', solver='saga', l1_ratio=0.5)
    MatrixPipeLine = Pipeline([('DE0', DiffExpTransform(fdr=fdr_DE)),
                               ('scaler0', StandardScaler()),
                               ('pca', PCA())])
    clf = Pipeline([('TransformedData', MatrixPipeLine), ('LR', MyLogistic)])
    param_grid = dict(LR__C=np.exp(-np.arange(-10, 10)),
                      TransformedData__pca__n_components=np.arange(3, 4))
    model = model_selection.GridSearchCV(clf, param_grid,
                                         scoring=metrics.make_scorer(metrics.f1_score), cv=5)
    # Trainer
    MyTrainer = Trainer(model=model)
    # random seed used in class definition is not used in final output models
    MyTrainTester = TrainTester(MyTrainer, metrics.f1_score)
    # note that random seed here affects sequence of seeds passed to making new TrainTester objects
    # using MyTrainTester as template. Thus, you have all settings but seed affecting sample split
    # across all data splits
    MyMultiTrainTester = MultiTrainTester(MyTrainTester, numpy_rand_seed=42, n_splits=n_splits)
    MyMultiTrainTester.train(X_inp, y)
    # save results (context manager guarantees the handle is closed)
    with open(out_path, 'wb') as outfile:
        pk.dump(MyMultiTrainTester, outfile)
else:
    # load previous results
    with open(out_path, 'rb') as infile:
        MyMultiTrainTester = pk.load(infile)
Running for split 1 of 10 Using predict_proba getting predictions from probs Running for split 2 of 10 Using predict_proba getting predictions from probs Running for split 3 of 10 Using predict_proba getting predictions from probs Running for split 4 of 10 Using predict_proba getting predictions from probs Running for split 5 of 10 Using predict_proba getting predictions from probs Running for split 6 of 10 Using predict_proba getting predictions from probs Running for split 7 of 10 Using predict_proba getting predictions from probs Running for split 8 of 10 Using predict_proba getting predictions from probs Running for split 9 of 10 Using predict_proba getting predictions from probs Running for split 10 of 10 Using predict_proba getting predictions from probs
# Collect per-split train/test F1 scores into one long-format frame.
# Fix: DataFrame.append is deprecated and removed in pandas 2.0; use pd.concat.
scores_df = pd.concat([
    pd.DataFrame({'score': MyMultiTrainTester.train_scores,
                  'stage': np.repeat('train', n_splits)}),
    pd.DataFrame({'score': MyMultiTrainTester.test_scores,
                  'stage': np.repeat('test', n_splits)}),
])
scores_df
| score | stage | |
|---|---|---|
| 0 | 0.697987 | train |
| 1 | 0.717608 | train |
| 2 | 0.702875 | train |
| 3 | 0.668919 | train |
| 4 | 0.700680 | train |
| 5 | 0.700680 | train |
| 6 | 0.721519 | train |
| 7 | 0.673611 | train |
| 8 | 0.666667 | train |
| 9 | 0.702703 | train |
| 0 | 0.750000 | test |
| 1 | 0.710526 | test |
| 2 | 0.634921 | test |
| 3 | 0.781609 | test |
| 4 | 0.650602 | test |
| 5 | 0.650602 | test |
| 6 | 0.678261 | test |
| 7 | 0.752941 | test |
| 8 | 0.734177 | test |
| 9 | 0.658228 | test |
# Compare train vs test F1 distributions across the splits.
sns.boxplot(data = scores_df, x = 'stage', y = 'score')
<AxesSubplot:xlabel='stage', ylabel='score'>
# Spot-check: the differentially-abundant feature matrix produced by the
# DE0 step of split i's best estimator, applied to the full input.
i = 1
MyMultiTrainTester.TrainerList[i].model.best_estimator_['TransformedData'].named_steps['DE0'].transform(X_inp)
array([[0.01, 0.01, 0. , ..., 0. , 0. , 0. ],
[0.02, 0.01, 0. , ..., 0. , 0. , 0. ],
[0.02, 0.02, 0. , ..., 0. , 0. , 0. ],
...,
[0.03, 0. , 0. , ..., 0. , 0. , 0. ],
[0.05, 0. , 0. , ..., 0. , 0. , 0. ],
[0.23, 0. , 0. , ..., 0. , 0.01, 0.01]], dtype=float32)
# Tabulate the winning hyperparameters and number of DE-selected features
# for each split's grid search.
hyperparams = {'C': [], 'n_feats': [], 'n_components': []}
for i in range(n_splits):
    best = MyMultiTrainTester.TrainerList[i].model
    hyperparams['C'].append(best.best_params_['LR__C'])
    # selected_feats is only summed here, so the former copy.deepcopy was
    # unnecessary work.
    selected = best.best_estimator_['TransformedData'].named_steps['DE0'].selected_feats
    hyperparams['n_feats'].append(np.sum(selected))
    hyperparams['n_components'].append(best.best_params_['TransformedData__pca__n_components'])
hyperparams_df = pd.DataFrame(hyperparams)
hyperparams_df
| C | n_feats | n_components | |
|---|---|---|---|
| 0 | 0.049787 | 113 | 3 |
| 1 | 0.049787 | 126 | 3 |
| 2 | 0.018316 | 103 | 3 |
| 3 | 0.006738 | 97 | 3 |
| 4 | 22026.465795 | 111 | 3 |
| 5 | 22026.465795 | 111 | 3 |
| 6 | 0.000912 | 126 | 3 |
| 7 | 0.049787 | 104 | 3 |
| 8 | 0.018316 | 109 | 3 |
| 9 | 22026.465795 | 128 | 3 |
# Gather per-split scoring metrics and persist them for downstream reports.
scoring_metrics = MyMultiTrainTester.getScores()
scoring_metrics_df = pd.DataFrame(scoring_metrics)
metrics_csv = os.path.join(output_dir, 'scoring_metrics.csv')
scoring_metrics_df.to_csv(metrics_csv)
# Box-plot the distribution of each score type across the splits.
sns.set(rc={"figure.figsize": (12, 8)})
sns.boxplot(data=scoring_metrics_df, x='score_type', y='value')
<AxesSubplot:xlabel='score_type', ylabel='value'>
# Summarize each score type across splits.
# NOTE: np.std is the population standard deviation (ddof=0).
score_names = np.unique(scoring_metrics['score_type'])
score_stats = {'score': [], 'mean': [], 'median': [], 'std_dev': []}
for score in score_names:
    mask = scoring_metrics_df['score_type'] == score
    vals = scoring_metrics_df['value'].to_numpy()[mask]
    score_stats['score'].append(score)
    score_stats['mean'].append(np.mean(vals))
    score_stats['median'].append(np.median(vals))
    score_stats['std_dev'].append(np.std(vals))
score_stats_df = pd.DataFrame(score_stats)
score_stats_df
| score | mean | median | std_dev | |
|---|---|---|---|---|
| 0 | AUPRC_NEG | 0.683508 | 0.686421 | 0.082124 |
| 1 | AUPRC_POS | 0.804959 | 0.839226 | 0.105527 |
| 2 | AUROC_NEG | 0.752136 | 0.785025 | 0.091152 |
| 3 | AUROC_POS | 0.752136 | 0.785025 | 0.091152 |
| 4 | f1_score | 0.700187 | 0.694394 | 0.049573 |
| 5 | npv_score | 0.544100 | 0.612155 | 0.190656 |
| 6 | ppv_score | 0.761243 | 0.772059 | 0.096541 |
| 7 | sensitivity | 0.671391 | 0.643651 | 0.123153 |
| 8 | specificity | 0.688736 | 0.767519 | 0.237253 |
# Persist the summary table alongside the other outputs.
# Fix: previously written to the current working directory instead of output_dir,
# inconsistent with scoring_metrics.csv above.
score_stats_df.to_csv(os.path.join(output_dir, 'score_stats.csv'))
# Per-split diagnostic plots.
MyMultiTrainTester.plot_confusion(normalize=True, figsize=(15, 25))
MyMultiTrainTester.plot_confusion(normalize=False, figsize=(15, 25))
MyMultiTrainTester.plot_class_freq(normalize=True, figsize=(15, 35))
MyMultiTrainTester.plot_precrecall(figsize=(15, 35))
This notebook is intended as a generic template for use with the papermill Python library, allowing automated generation of analyses and reports for classifiers on microbiome data produced by the kraken2 pipeline.
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
from sklearn import model_selection
from sklearn import metrics
import os
import re
import copy
import numpy as np
import pandas as pd
import sys
sys.path.insert(0, '/project/6011811/data/microbiome_OJS/workflow/src/')
from MicroBiome import MicroBiomeDataSet, Trainer, TrainTester, MultiTrainTester, list_transformer, DiffExpTransform
from ScoreFunctions import *
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model as LM
import seaborn as sns
import pickle as pk
from matplotlib import pyplot as plt
# Ignore warning messages
if True:
import warnings
warnings.filterwarnings('ignore')
# Default parameters for interactive use; papermill overrides these via the
# injected "# Parameters" cell that follows.
input_dir = '/project/data/preprocessed/PE_50K_sex_complete'
output_dir = '/project/results/LR_Classifier_DE_generic'
retrain = True
fdr_DE = 0.05
# Parameters
# (papermill-injected cell: values for this particular run)
input_dir = "results/kraken2_PE_10M/notebooks/PE_10M_Sex/prepped_data"
output_dir = "results/kraken2_PE_10M/notebooks/PE_10M_Sex/LR_DE"
retrain = True
fdr_DE = 0.05
# Sanity check: list the prepped-data files expected in input_dir.
os.listdir(input_dir)
['meta_data_mat.pk', 'metadata_samples_keep.csv', 'y.pk', 'feat_meta.csv', 'X.pk']
# Load the feature matrix X and labels y produced by the data-prep notebook.
# Fix: use context managers so the file handles are closed even on error.
with open(os.path.join(input_dir, 'X.pk'), 'rb') as infile_X:
    X = pk.load(infile_X)
with open(os.path.join(input_dir, 'y.pk'), 'rb') as infile_y:
    y = pk.load(infile_y)
# Metadata matrix loading is disabled; only microbiome features are used.
# infile_meta_data_mat = open(os.path.join(input_dir, 'meta_data_mat.pk'), 'rb')
# meta_data_mat = pk.load(infile_meta_data_mat)
# infile_meta_data_mat.close()
# model input
# X_inp = np.concatenate([X, meta_data_mat], axis=1)
X_inp = X
n_splits = 10
out_path = os.path.join(output_dir, 'MyMultiTrainTester.pk')
if retrain:
    # Clear previous results, if any. shutil.rmtree replaces the former
    # os.system('rm -rf ' + output_dir), which was non-portable and unsafe
    # for paths containing spaces or shell metacharacters.
    import shutil
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)
    # We run PCA on the microbiome data's differentially abundant features.
    MyLogistic = LM.LogisticRegression(random_state=42, class_weight='balanced',
                                       penalty='elasticnet', solver='saga', l1_ratio=0.5)
    MatrixPipeLine = Pipeline([('DE0', DiffExpTransform(fdr=fdr_DE)),
                               ('scaler0', StandardScaler()),
                               ('pca', PCA())])
    clf = Pipeline([('TransformedData', MatrixPipeLine), ('LR', MyLogistic)])
    param_grid = dict(LR__C=np.exp(-np.arange(-10, 10)),
                      TransformedData__pca__n_components=np.arange(3, 4))
    model = model_selection.GridSearchCV(clf, param_grid,
                                         scoring=metrics.make_scorer(metrics.f1_score), cv=5)
    # Trainer
    MyTrainer = Trainer(model=model)
    # random seed used in class definition is not used in final output models
    MyTrainTester = TrainTester(MyTrainer, metrics.f1_score)
    # note that random seed here affects sequence of seeds passed to making new TrainTester objects
    # using MyTrainTester as template. Thus, you have all settings but seed affecting sample split
    # across all data splits
    MyMultiTrainTester = MultiTrainTester(MyTrainTester, numpy_rand_seed=42, n_splits=n_splits)
    MyMultiTrainTester.train(X_inp, y)
    # save results (context manager guarantees the handle is closed)
    with open(out_path, 'wb') as outfile:
        pk.dump(MyMultiTrainTester, outfile)
else:
    # load previous results
    with open(out_path, 'rb') as infile:
        MyMultiTrainTester = pk.load(infile)
Running for split 1 of 10 Using predict_proba getting predictions from probs Running for split 2 of 10 Using predict_proba getting predictions from probs Running for split 3 of 10 Using predict_proba getting predictions from probs Running for split 4 of 10 Using predict_proba getting predictions from probs Running for split 5 of 10 Using predict_proba getting predictions from probs Running for split 6 of 10 Using predict_proba getting predictions from probs Running for split 7 of 10 Using predict_proba getting predictions from probs Running for split 8 of 10 Using predict_proba getting predictions from probs Running for split 9 of 10 Using predict_proba getting predictions from probs Running for split 10 of 10 Using predict_proba getting predictions from probs
# Collect per-split train/test F1 scores into one long-format frame.
# Fix: DataFrame.append is deprecated and removed in pandas 2.0; use pd.concat.
scores_df = pd.concat([
    pd.DataFrame({'score': MyMultiTrainTester.train_scores,
                  'stage': np.repeat('train', n_splits)}),
    pd.DataFrame({'score': MyMultiTrainTester.test_scores,
                  'stage': np.repeat('test', n_splits)}),
])
scores_df
| score | stage | |
|---|---|---|
| 0 | 0.684564 | train |
| 1 | 0.711409 | train |
| 2 | 0.704762 | train |
| 3 | 0.684211 | train |
| 4 | 0.697987 | train |
| 5 | 0.697987 | train |
| 6 | 0.721519 | train |
| 7 | 0.682759 | train |
| 8 | 0.684385 | train |
| 9 | 0.709459 | train |
| 0 | 0.734177 | test |
| 1 | 0.701299 | test |
| 2 | 0.656250 | test |
| 3 | 0.781609 | test |
| 4 | 0.650602 | test |
| 5 | 0.650602 | test |
| 6 | 0.678261 | test |
| 7 | 0.756098 | test |
| 8 | 0.725000 | test |
| 9 | 0.691358 | test |
# Compare train vs test F1 distributions across the splits.
sns.boxplot(data = scores_df, x = 'stage', y = 'score')
<AxesSubplot:xlabel='stage', ylabel='score'>
# Spot-check: the differentially-abundant feature matrix produced by the
# DE0 step of split i's best estimator, applied to the full input.
i = 1
MyMultiTrainTester.TrainerList[i].model.best_estimator_['TransformedData'].named_steps['DE0'].transform(X_inp)
array([[0.01, 0.01, 0. , ..., 0. , 0. , 0. ],
[0.02, 0.01, 0. , ..., 0. , 0. , 0. ],
[0.01, 0.01, 0. , ..., 0. , 0. , 0. ],
...,
[0.03, 0. , 0. , ..., 0. , 0. , 0. ],
[0.05, 0. , 0. , ..., 0. , 0. , 0. ],
[0.24, 0. , 0. , ..., 0. , 0. , 0.01]], dtype=float32)
# Tabulate the winning hyperparameters and number of DE-selected features
# for each split's grid search.
hyperparams = {'C': [], 'n_feats': [], 'n_components': []}
for i in range(n_splits):
    best = MyMultiTrainTester.TrainerList[i].model
    hyperparams['C'].append(best.best_params_['LR__C'])
    # selected_feats is only summed here, so the former copy.deepcopy was
    # unnecessary work.
    selected = best.best_estimator_['TransformedData'].named_steps['DE0'].selected_feats
    hyperparams['n_feats'].append(np.sum(selected))
    hyperparams['n_components'].append(best.best_params_['TransformedData__pca__n_components'])
hyperparams_df = pd.DataFrame(hyperparams)
hyperparams_df
| C | n_feats | n_components | |
|---|---|---|---|
| 0 | 0.018316 | 109 | 3 |
| 1 | 0.049787 | 127 | 3 |
| 2 | 0.135335 | 102 | 3 |
| 3 | 0.049787 | 94 | 3 |
| 4 | 0.367879 | 110 | 3 |
| 5 | 0.367879 | 110 | 3 |
| 6 | 0.000912 | 123 | 3 |
| 7 | 0.049787 | 101 | 3 |
| 8 | 0.049787 | 115 | 3 |
| 9 | 22026.465795 | 128 | 3 |
# Gather per-split scoring metrics and persist them for downstream reports.
scoring_metrics = MyMultiTrainTester.getScores()
scoring_metrics_df = pd.DataFrame(scoring_metrics)
metrics_csv = os.path.join(output_dir, 'scoring_metrics.csv')
scoring_metrics_df.to_csv(metrics_csv)
# Box-plot the distribution of each score type across the splits.
sns.set(rc={"figure.figsize": (12, 8)})
sns.boxplot(data=scoring_metrics_df, x='score_type', y='value')
<AxesSubplot:xlabel='score_type', ylabel='value'>
# Summarize each score type across splits.
# NOTE: np.std is the population standard deviation (ddof=0).
score_names = np.unique(scoring_metrics['score_type'])
score_stats = {'score': [], 'mean': [], 'median': [], 'std_dev': []}
for score in score_names:
    mask = scoring_metrics_df['score_type'] == score
    vals = scoring_metrics_df['value'].to_numpy()[mask]
    score_stats['score'].append(score)
    score_stats['mean'].append(np.mean(vals))
    score_stats['median'].append(np.median(vals))
    score_stats['std_dev'].append(np.std(vals))
score_stats_df = pd.DataFrame(score_stats)
score_stats_df
| score | mean | median | std_dev | |
|---|---|---|---|---|
| 0 | AUPRC_NEG | 0.685840 | 0.688271 | 0.082143 |
| 1 | AUPRC_POS | 0.804055 | 0.833811 | 0.104370 |
| 2 | AUROC_NEG | 0.750459 | 0.777912 | 0.090356 |
| 3 | AUROC_POS | 0.750459 | 0.777912 | 0.090356 |
| 4 | f1_score | 0.702526 | 0.696328 | 0.043451 |
| 5 | npv_score | 0.545830 | 0.614402 | 0.191402 |
| 6 | ppv_score | 0.762929 | 0.760714 | 0.097284 |
| 7 | sensitivity | 0.674581 | 0.644444 | 0.119973 |
| 8 | specificity | 0.689465 | 0.753321 | 0.238342 |
# Persist the summary table alongside the other outputs.
# Fix: previously written to the current working directory instead of output_dir,
# inconsistent with scoring_metrics.csv above.
score_stats_df.to_csv(os.path.join(output_dir, 'score_stats.csv'))
# Per-split diagnostic plots.
MyMultiTrainTester.plot_confusion(normalize=True, figsize=(15, 25))
MyMultiTrainTester.plot_confusion(normalize=False, figsize=(15, 25))
MyMultiTrainTester.plot_class_freq(normalize=True, figsize=(15, 35))
MyMultiTrainTester.plot_precrecall(figsize=(15, 35))